1 /***********************************************************************
2
3 Copyright (c) 1995, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2009, 2016, Percona Inc.
5
6 Portions of this file contain modifications contributed and copyrighted
7 by Percona Inc.. Those modifications are
8 gratefully acknowledged and are described briefly in the InnoDB
9 documentation. The contributions by Percona Inc. are incorporated with
10 their permission, and subject to the conditions contained in the file
11 COPYING.Percona.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 ***********************************************************************/
34
35 /**************************************************//**
36 @file os/os0file.cc
37 The interface to the operating system file i/o primitives
38
39 Created 10/21/1995 Heikki Tuuri
40 *******************************************************/
41
42 #ifndef UNIV_INNOCHECKSUM
43
44 #include "ha_prototypes.h"
45 #include "sql_const.h"
46 #include "system_key.h"
47
48 #include "os0file.h"
49
50 #include "fil0crypt.h"
51 #include "system_key.h"
52
53 #ifdef UNIV_NONINL
54 #include "os0file.ic"
55 #endif
56
57 #include "page0page.h"
58 #include "srv0srv.h"
59 #include "srv0start.h"
60 #include "fil0fil.h"
61 #include "btr0types.h"
62 #include "trx0trx.h"
63 #ifndef UNIV_HOTBACKUP
64 # include "os0event.h"
65 # include "os0thread.h"
66 #else /* !UNIV_HOTBACKUP */
67 # ifdef _WIN32
68 /* Add includes for the _stat() call to compile on Windows */
69 # include <sys/types.h>
70 # include <sys/stat.h>
71 # include <errno.h>
72 # endif /* _WIN32 */
73 #endif /* !UNIV_HOTBACKUP */
74
75 #include <vector>
76 #include <functional>
77
78 #include "fil0crypt.h"
79
80 #ifdef LINUX_NATIVE_AIO
81 #include <libaio.h>
82 #endif /* LINUX_NATIVE_AIO */
83
84 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
85 # include <fcntl.h>
86 # include <linux/falloc.h>
87 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
88
89 #include <lz4.h>
90 #include <zlib.h>
91
92 #ifdef UNIV_DEBUG
93 /** Set when InnoDB has invoked exit(). */
94 bool innodb_calling_exit;
95 #include <ut0ut.h>
96 #endif /* UNIV_DEBUG */
97
98 #include <my_aes.h>
99 #include <my_rnd.h>
100 #include <mysqld.h>
101 #include "fil0crypt.h"
102 #include <mysql/service_mysql_keyring.h>
103 #include "buf0buf.h"
104
105 /** Insert buffer segment id */
106 static const ulint IO_IBUF_SEGMENT = 0;
107
108 /** Log segment id */
109 static const ulint IO_LOG_SEGMENT = 1;
110
111 /** Number of retries for partial I/O's */
112 static const ulint NUM_RETRIES_ON_PARTIAL_IO = 10;
113
114 /** Blocks for doing IO, used in the transparent compression
115 and encryption code. */
116 struct Block {
117 /** Default constructor */
BlockBlock118 Block() : m_ptr(), m_in_use() { }
119
120 byte* m_ptr;
121
122 byte pad[CACHE_LINE_SIZE - sizeof(ulint)];
123 lock_word_t m_in_use;
124 };
125
126 /** For storing the allocated blocks */
127 typedef std::vector<Block> Blocks;
128
129 /** Block collection */
130 static Blocks* block_cache;
131
132 /** Number of blocks to allocate for sync read/writes */
133 static const size_t MAX_BLOCKS = 128;
134
135 /** Block buffer size */
136 #define BUFFER_BLOCK_SIZE ((ulint)(UNIV_PAGE_SIZE * 1.3))
137
138 /** Disk sector size of aligning write buffer for DIRECT_IO */
139 static ulint os_io_ptr_align = UNIV_SECTOR_SIZE;
140
141 /* This specifies the file permissions InnoDB uses when it creates files in
142 Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
143 my_umask */
144
145 #ifndef _WIN32
146 /** Umask for creating files */
147 static ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
148 #else
149 /** Umask for creating files */
150 static ulint os_innodb_umask = 0;
151
152 /* On Windows when using native AIO the number of AIO requests
153 that a thread can handle at a given time is limited to 32
154 i.e.: SRV_N_PENDING_IOS_PER_THREAD */
155 #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD
156
157 #endif /* _WIN32 */
158
159 #ifndef UNIV_HOTBACKUP
160
161 /** In simulated aio, merge at most this many consecutive i/os */
162 static const ulint OS_AIO_MERGE_N_CONSECUTIVE = 64;
163
164 /** Flag indicating if the page_cleaner is in active state. */
165 extern bool buf_page_cleaner_is_active;
166
167 /**********************************************************************
168
169 InnoDB AIO Implementation:
170 =========================
171
172 We support native AIO for Windows and Linux. For rest of the platforms
173 we simulate AIO by special IO-threads servicing the IO-requests.
174
175 Simulated AIO:
176 ==============
177
178 On platforms where we 'simulate' AIO, the following is a rough explanation
179 of the high level design.
180 There are four io-threads (for ibuf, log, read, write).
181 All synchronous IO requests are serviced by the calling thread using
182 os_file_write/os_file_read. The Asynchronous requests are queued up
183 in an array (there are four such arrays) by the calling thread.
184 Later these requests are picked up by the IO-thread and are serviced
185 synchronously.
186
187 Windows native AIO:
188 ==================
189
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one limitation is that if a file is
opened for AIO no synchronous IO can be done on it. Therefore we have
an extra fifth array to queue up synchronous IO requests.
195 There are innodb_file_io_threads helper threads. These threads work
196 on the four arrays mentioned above in Simulated AIO. No thread is
197 required for the sync array.
198 If a synchronous IO request is made, it is first queued in the sync
199 array. Then the calling thread itself waits on the request, thus
200 making the call synchronous.
201 If an AIO request is made the calling thread not only queues it in the
202 array but also submits the requests. The helper thread then collects
203 the completed IO request and calls completion routine on it.
204
205 Linux native AIO:
206 =================
207
208 If we have libaio installed on the system and innodb_use_native_aio
209 is set to true we follow the code path of native AIO, otherwise we
210 do simulated AIO.
211 There are innodb_file_io_threads helper threads. These threads work
212 on the four arrays mentioned above in Simulated AIO.
213 If a synchronous IO request is made, it is handled by calling
214 os_file_write/os_file_read.
215 If an AIO request is made the calling thread not only queues it in the
216 array but also submits the requests. The helper thread then collects
217 the completed IO request and calls completion routine on it.
218
219 **********************************************************************/
220
221
222 #ifdef UNIV_PFS_IO
223 /* Keys to register InnoDB I/O with performance schema */
224 mysql_pfs_key_t innodb_data_file_key;
225 mysql_pfs_key_t innodb_log_file_key;
226 mysql_pfs_key_t innodb_temp_file_key;
227 mysql_pfs_key_t innodb_bmp_file_key;
228 mysql_pfs_key_t innodb_parallel_dblwrite_file_key;
229 #endif /* UNIV_PFS_IO */
230
/** The asynchronous I/O context: one pending (or free) AIO request held
in an AIO array. */
struct Slot {
	/** Constructor. Zero-fills every byte of the object with memset(),
	including the embedded IORequest and the platform control block. */
	Slot() { memset(this, 0, sizeof(*this)); }

	/** index of the slot in the aio array */
	uint16_t		pos;

	/** true if this slot is reserved */
	bool			is_reserved;

	/** time when reserved */
	ib_time_monotonic_t	reservation_time;

	/** buffer used in i/o */
	byte*			buf;

	/** Buffer pointer used for actual IO. We advance this
	when partial IO is required and not buf */
	byte*			ptr;

	/** OS_FILE_READ or OS_FILE_WRITE */
	IORequest		type;

	/** file offset in bytes */
	os_offset_t		offset;

	/** file where to read or write */
	pfs_os_file_t		file;

	/** file name or path */
	const char*		name;

	/** used only in simulated aio: true if the physical i/o
	already made and only the slot message needs to be passed
	to the caller of os_aio_simulated_handle */
	bool			io_already_done;

	/** Space id handed to AIO::reserve_slot() for this request.
	NOTE(review): presumably the tablespace id of the file - confirm. */
	ulint			space_id;

	/** The file node for which the IO is requested. */
	fil_node_t*		m1;

	/** the requester of an aio operation and which can be used
	to identify which pending aio operation was completed */
	void*			m2;

	/** AIO completion status */
	dberr_t			err;

#ifdef WIN_ASYNC_IO
	/** handle object we need in the OVERLAPPED struct */
	HANDLE			handle;

	/** Windows control block for the aio request */
	OVERLAPPED		control;

	/** bytes written/read */
	DWORD			n_bytes;

	/** length of the block to read or write */
	DWORD			len;

#elif defined(LINUX_NATIVE_AIO)
	/** Linux control block for aio */
	struct iocb		control;

	/** AIO return code */
	int			ret;

	/** bytes written/read. */
	ssize_t			n_bytes;

	/** length of the block to read or write */
	ulint			len;
#else
	/** length of the block to read or write */
	ulint			len;

	/** bytes written/read. */
	ulint			n_bytes;
#endif /* WIN_ASYNC_IO */

	/** Length of the block before it was compressed */
	uint32			original_len;

	/** Buffer block for compressed pages or encrypted pages */
	Block*			buf_block;

	/** true, if we shouldn't punch a hole after writing the page */
	bool			skip_punch_hole;

	/** Buffer for encrypt log */
	void*			encrypt_log_buf;
};
325
/** The asynchronous i/o array structure. Each instance owns a vector of
Slot objects split into m_n_segments segments and a mutex protecting them;
the static members hold the per-purpose arrays (ibuf, log, reads, writes,
sync). */
class AIO {
public:
	/** Constructor
	@param[in]	id		Latch ID
	@param[in]	n_slots		Number of slots to configure
	@param[in]	segments	Number of segments to configure */
	AIO(latch_id_t id, ulint n_slots, ulint segments);

	/** Destructor */
	~AIO();

	/** Initialize the instance
	@return DB_SUCCESS or error code */
	dberr_t init();

	/** Requests for a slot in the aio array. If no slot is available, waits
	until not_full-event becomes signaled.

	@param[in,out]	type	IO context
	@param[in,out]	m1	message to be passed along with the AIO
				operation
	@param[in,out]	m2	message to be passed along with the AIO
				operation
	@param[in]	file	file handle
	@param[in]	name	name of the file or path as a null-terminated
				string
	@param[in,out]	buf	buffer where to read or from which to write
	@param[in]	offset	file offset, where to read from or start writing
	@param[in]	len	length of the block to read or write
	@param[in]	space_id	space id of the request
				(NOTE(review): presumably the tablespace id
				of the file - confirm against the definition)
	@return pointer to slot */
	Slot* reserve_slot(
		IORequest&	type,
		fil_node_t*	m1,
		void*		m2,
		pfs_os_file_t	file,
		const char*	name,
		void*		buf,
		os_offset_t	offset,
		ulint		len,
		ulint		space_id)
		MY_ATTRIBUTE((warn_unused_result));

	/** @return number of reserved slots */
	ulint pending_io_count() const;

	/** Returns a pointer to the nth slot in the aio array.
	Asserts (ut_a) that the index is within m_slots.
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	const Slot* at(ulint i) const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Non const version of at().
	@param[in]	i	Index of the slot in the array
	@return pointer to slot */
	Slot* at(ulint i)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(i < m_slots.size());

		return(&m_slots[i]);
	}

	/** Frees a slot in the AIO array, assumes caller owns the mutex.
	@param[in,out]	slot	Slot to release */
	void release(Slot* slot);

	/** Frees a slot in the AIO array, assumes caller doesn't own the mutex.
	@param[in,out]	slot	Slot to release */
	void release_with_mutex(Slot* slot);

	/** Prints info about the aio array.
	@param[in,out]	file	Where to print */
	void print(FILE* file);

	/** @return the number of slots per segment, computed as
	m_slots.size() / m_n_segments */
	ulint slots_per_segment() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_slots.size() / m_n_segments);
	}

	/** @return the number of segments in this array */
	ulint get_n_segments() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(m_n_segments);
	}

#ifdef UNIV_DEBUG
	/** @return true if the thread owns the mutex */
	bool is_mutex_owned() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(mutex_own(&m_mutex));
	}
#endif /* UNIV_DEBUG */

	/** Acquire the mutex */
	void acquire() const
	{
		mutex_enter(&m_mutex);
	}

	/** Release the mutex. Not to be confused with release(Slot*),
	which frees a slot. */
	void release() const
	{
		mutex_exit(&m_mutex);
	}

	/** Write out the state to the file/stream
	@param[in,out]	file	File to write to */
	void to_file(FILE* file) const;

	/** Submit buffered AIO requests on the given segment to the kernel.
	(low level function).
	@param[in]	acquire_mutex	specifies whether to lock array mutex */
	static void os_aio_dispatch_read_array_submit_low(
		bool acquire_mutex);

#ifdef LINUX_NATIVE_AIO
	/** Dispatch an AIO request to the kernel.
	@param[in,out]	slot		an already reserved slot
	@param[in]	should_buffer	should buffer the request
					rather than submit
	@return true on success. */
	bool linux_dispatch(Slot* slot, bool should_buffer)
		MY_ATTRIBUTE((warn_unused_result));

	/** Accessor for an AIO event.
	Asserts (ut_a) that the index is within m_events.
	@param[in]	index	Index into the array
	@return the event at the index */
	io_event* io_events(ulint index)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_a(index < m_events.size());

		return(&m_events[index]);
	}

	/** Accessor for the AIO context
	@param[in]	segment	Segment for which to get the context
	@return the AIO context for the segment */
	io_context* io_ctx(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < get_n_segments());

		return(m_aio_ctx[segment]);
	}

	/** Creates an io_context for native linux AIO.
	@param[in]	max_events	number of events
	@param[out]	io_ctx		io_ctx to initialize.
	@return true on success. */
	static bool linux_create_io_ctx(ulint max_events, io_context_t* io_ctx)
		MY_ATTRIBUTE((warn_unused_result));

	/** Checks if the system supports native linux aio. On some kernel
	versions where native aio is supported it won't work on tmpfs. In such
	cases we can't use native aio as it is not possible to mix simulated
	and native aio.
	@return true if supported, false otherwise. */
	static bool is_linux_native_aio_supported()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

#ifdef WIN_ASYNC_IO
	/** Wakes up all async i/o threads in the array in Windows async I/O at
	shutdown. Calls SetEvent() on the handle of every slot. */
	void signal()
	{
		for (ulint i = 0; i < m_slots.size(); ++i) {
			SetEvent(m_slots[i].handle);
		}
	}

	/** Wake up all AIO threads in Windows native aio.
	NOTE(review): s_reads is dereferenced unconditionally, whereas
	s_writes, s_ibuf and s_log are NULL-checked first - confirm that
	s_reads is always set when this is called. */
	static void wake_at_shutdown()
	{
		s_reads->signal();

		if (s_writes != NULL) {
			s_writes->signal();
		}

		if (s_ibuf != NULL) {
			s_ibuf->signal();
		}

		if (s_log != NULL) {
			s_log->signal();
		}
	}
#endif /* WIN_ASYNC_IO */

#ifdef _WIN32
	/** This function can be called if one wants to post a batch of reads
	and prefers an I/O - handler thread to handle them all at once later. You
	must call os_aio_simulated_wake_handler_threads later to ensure the
	threads are not left sleeping! */
	static void simulated_put_read_threads_to_sleep();

	/** The non asynchronous IO array.
	@return the synchronous AIO array instance. */
	static AIO* sync_array()
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_sync);
	}

	/**
	Get the AIO handles for a segment.
	@param[in]	segment	The local segment.
	@return the handles for the segment, i.e. a pointer to the element
	at index segment * slots_per_segment() of m_handles. */
	HANDLE* handles(ulint segment)
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(segment < m_handles->size() / slots_per_segment());

		return(&(*m_handles)[segment * slots_per_segment()]);
	}

	/** @return true if no slots are reserved; the caller must own the
	mutex (checked with ut_ad) */
	bool is_empty() const
		MY_ATTRIBUTE((warn_unused_result))
	{
		ut_ad(is_mutex_owned());
		return(m_n_reserved == 0);
	}
#endif /* _WIN32 */

	/** Create an instance using new(std::nothrow)
	@param[in]	id		Latch ID
	@param[in]	n_slots		The number of AIO request slots
	@param[in]	segments	The number of segments
	@return a new AIO instance */
	static AIO* create(
		latch_id_t	id,
		ulint		n_slots,
		ulint		segments)
		MY_ATTRIBUTE((warn_unused_result));

	/** Initializes the asynchronous io system. Creates one array each
	for ibuf and log I/O. Also creates one array each for read and write
	where each array is divided logically into n_readers and n_writers
	respectively. The caller must create an i/o handler thread for each
	segment in these arrays. This function also creates the sync array.
	No I/O handler thread needs to be created for that
	@param[in]	n_per_seg	maximum number of pending aio
					operations allowed per segment
	@param[in]	n_readers	number of reader threads
	@param[in]	n_writers	number of writer threads
	@param[in]	n_slots_sync	number of slots in the sync aio array
	@return true if AIO sub-system was started successfully */
	static bool start(
		ulint		n_per_seg,
		ulint		n_readers,
		ulint		n_writers,
		ulint		n_slots_sync)
		MY_ATTRIBUTE((warn_unused_result));

	/** Free the AIO arrays */
	static void shutdown();

	/** Print all the AIO segments
	@param[in,out]	file		Where to print */
	static void print_all(FILE* file);

	/** Calculates local segment number and aio array from global
	segment number.
	@param[out]	array		AIO wait array
	@param[in]	segment		global segment number
	@return local segment number within the aio array */
	static ulint get_array_and_local_segment(
		AIO**		array,
		ulint		segment)
		MY_ATTRIBUTE((warn_unused_result));

	/** Select the IO slot array
	@param[in]	type		Type of IO, READ or WRITE
	@param[in]	read_only	true if running in read-only mode
	@param[in]	mode		IO mode
	@return slot array or NULL if invalid mode specified */
	static AIO* select_slot_array(
		IORequest&	type,
		bool		read_only,
		ulint		mode)
		MY_ATTRIBUTE((warn_unused_result));

	/** Calculates segment number for a slot.
	@param[in]	array		AIO wait array
	@param[in]	slot		slot in this array
	@return segment number (which is the number used by, for example,
		I/O handler threads) */
	static ulint get_segment_no_from_slot(
		const AIO*	array,
		const Slot*	slot)
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays */
	static void wake_simulated_handler_thread(ulint global_segment);

	/** Check if it is a read request
	@param[in]	aio		The AIO instance to check
	@return true if the AIO instance is for reading, i.e. it is the
	s_reads array. */
	static bool is_read(const AIO* aio)
		MY_ATTRIBUTE((warn_unused_result))
	{
		return(s_reads == aio);
	}

	/** Wait on an event until no pending writes.
	Waits on the m_is_empty event of s_writes.
	NOTE(review): s_writes is dereferenced without a NULL check -
	confirm that callers only use this once the write array exists. */
	static void wait_until_no_pending_writes()
	{
		os_event_wait(AIO::s_writes->m_is_empty);
	}

	/** Print to file
	@param[in]	file		File to write to */
	static void print_to_file(FILE* file);

	/** Check for pending IO. Gets the count and also validates the
	data structures.
	@return count of pending IO requests */
	static ulint total_pending_io_count();

private:
	/** Initialise the slots
	@return DB_SUCCESS or error code */
	dberr_t init_slots()
		MY_ATTRIBUTE((warn_unused_result));

	/** Wakes up a simulated AIO I/O-handler thread if it has something
	to do for a local segment in the AIO array.
	@param[in]	global_segment	the number of the segment in the
					AIO arrays
	@param[in]	segment		the local segment in the AIO array */
	void wake_simulated_handler_thread(ulint global_segment, ulint segment);

	/** Prints pending IO requests per segment of an aio array.
	We probably don't need per segment statistics but they can help us
	during development phase to see if the IO requests are being
	distributed as expected.
	@param[in,out]	file		file where to print
	@param[in]	segments	pending IO array */
	void print_segment_info(
		FILE*		file,
		const ulint*	segments);

#ifdef LINUX_NATIVE_AIO
	/** Initialise the Linux native AIO data structures
	@return DB_SUCCESS or error code */
	dberr_t init_linux_native_aio()
		MY_ATTRIBUTE((warn_unused_result));
#endif /* LINUX_NATIVE_AIO */

	/** Submit buffered AIO requests on the array to the kernel.
	(low level function).
	@param[in]	acquire_mutex	specifies whether to lock array mutex
	@param[in]	arr		array for which to submit IO */
	static void os_aio_dispatch_read_array_submit_low_for_array(
		bool acquire_mutex MY_ATTRIBUTE((unused)), const AIO* arr);

private:
	typedef std::vector<Slot> Slots;

	/** the mutex protecting the aio array */
	mutable SysMutex	m_mutex;

	/** The slots of the array. slots_per_segment() splits them
	across the segments as m_slots.size() / m_n_segments. */
	Slots			m_slots;

	/** Number of segments in the aio array of pending aio requests.
	A thread can wait separately for any one of the segments. */
	ulint			m_n_segments;

	/** The event which is set to the signaled state when
	there is space in the aio outside the ibuf segment */
	os_event_t		m_not_full;

	/** The event which is set to the signaled state when
	there are no pending i/os in this array */
	os_event_t		m_is_empty;

	/** Number of reserved slots in the AIO array outside
	the ibuf segment */
	ulint			m_n_reserved;

#ifdef _WIN32
	typedef std::vector<HANDLE, ut_allocator<HANDLE> > Handles;

	/** Pointer to an array of OS native event handles where
	we copied the handles from slots, in the same order. This
	can be used in WaitForMultipleObjects; used only in Windows */
	Handles*		m_handles;
#endif /* _WIN32 */

#if defined(LINUX_NATIVE_AIO)
	typedef std::vector<io_event> IOEvents;

	/** completion queue for IO. There is one such queue per
	segment. Each thread will work on one ctx exclusively. */
	io_context_t*		m_aio_ctx;

	/** The array to collect completed IOs. There is one such
	event for each possible pending IO. The size of the array
	is equal to m_slots.size(). */
	IOEvents		m_events;

	/** Array to buffer the not-submitted aio requests. The array length
	is n_slots. It is divided into n_segments segments. Pending requests
	on each segment are buffered separately. */
	struct iocb**		m_pending;

	/** Array of length n_segments. Each element counts the number of not
	submitted aio request on that segment. */
	ulint*			m_count;
#endif /* LINUX_NATIVE_AIO */

	/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as
	sync AIO. These are NULL when the module has not yet been
	initialized. */

	/** Insert buffer */
	static AIO*		s_ibuf;

	/** Redo log */
	static AIO*		s_log;

	/** Reads */
	static AIO*		s_reads;

	/** Writes */
	static AIO*		s_writes;

	/** Synchronous I/O */
	static AIO*		s_sync;
};
772
/** Definitions of the static AIO array pointers declared in class AIO.
As objects with static storage duration and no initializer they start
out as NULL. */
AIO*	AIO::s_reads;
AIO*	AIO::s_writes;
AIO*	AIO::s_ibuf;
AIO*	AIO::s_log;
AIO*	AIO::s_sync;
779
780 #if defined(LINUX_NATIVE_AIO)
781 /** timeout for each io_getevents() call = 500ms. */
782 static const ulint OS_AIO_REAP_TIMEOUT = 500000000UL;
783
784 /** time to sleep, in microseconds if io_setup() returns EAGAIN. */
785 static const ulint OS_AIO_IO_SETUP_RETRY_SLEEP = 500000UL;
786
787 /** number of attempts before giving up on io_setup(). */
788 static const int OS_AIO_IO_SETUP_RETRY_ATTEMPTS = 5;
789 #endif /* LINUX_NATIVE_AIO */
790
791 /** Array of events used in simulated AIO */
792 static os_event_t* os_aio_segment_wait_events = NULL;
793
794 /** Number of asynchronous I/O segments. Set by os_aio_init(). */
795 static ulint os_aio_n_segments = ULINT_UNDEFINED;
796
797 /** If the following is true, read i/o handler threads try to
798 wait until a batch of new read requests have been posted */
799 static bool os_aio_recommend_sleep_for_read_threads = false;
800 #endif /* !UNIV_HOTBACKUP */
801
802 ulint os_n_file_reads = 0;
803 ulint os_bytes_read_since_printout = 0;
804 ulint os_n_file_writes = 0;
805 ulint os_n_fsyncs = 0;
806 ulint os_n_file_reads_old = 0;
807 ulint os_n_file_writes_old = 0;
808 ulint os_n_fsyncs_old = 0;
809 /** Number of pending write operations */
810 ulint os_n_pending_writes = 0;
811 /** Number of pending read operations */
812 ulint os_n_pending_reads = 0;
813
814 ib_time_monotonic_t os_last_printout;
815 bool os_has_said_disk_full = false;
816
817 /** Default Zip compression level */
818 extern uint page_zip_level;
819
820 #if DATA_TRX_ID_LEN > 6
821 #error "COMPRESSION_ALGORITHM will not fit"
822 #endif /* DATA_TRX_ID_LEN */
823
824 /** Validates the consistency of the aio system.
825 @return true if ok */
826 static
827 bool
828 os_aio_validate();
829
830 /** Does error handling when a file operation fails.
831 @param[in] name File name or NULL
832 @param[in] operation Name of operation e.g., "read", "write"
833 @return true if we should retry the operation */
834 static
835 bool
836 os_file_handle_error(
837 const char* name,
838 const char* operation);
839
840 /** Free storage space associated with a section of the file.
841 @param[in] fh Open file handle
842 @param[in] off Starting offset (SEEK_SET)
843 @param[in] len Size of the hole
844 @return DB_SUCCESS or error code */
845 dberr_t
846 os_file_punch_hole(
847 os_file_t fh,
848 os_offset_t off,
849 os_offset_t len);
850
851 /**
852 Does error handling when a file operation fails.
853 @param[in] name File name or NULL
854 @param[in] operation Name of operation e.g., "read", "write"
855 @param[in] silent if true then don't print any message to the log.
856 @return true if we should retry the operation */
857 static
858 bool
859 os_file_handle_error_no_exit(
860 const char* name,
861 const char* operation,
862 bool silent);
863
/** Decompress after a read and punch a hole in the file if it was a write
@param[in]	type		IO context
@param[in]	fh		Open file handle
@param[in,out]	buf		Buffer to transform
@param[in,out]	scratch		Scratch area for read decompression
@param[in]	src_len		Length of the buffer before compression
@param[in]	offset		Offset of the request
				(NOTE(review): presumably the file offset in
				bytes at which buf was read or written -
				confirm against the definition)
@param[in]	len		Compressed buffer length for write and size
				of buf len for read
@return DB_SUCCESS or error code */
static
dberr_t
os_file_io_complete(
	const IORequest&type,
	os_file_t	fh,
	byte*		buf,
	byte*		scratch,
	ulint		src_len,
	os_offset_t	offset,
	ulint		len);
883
/** Does simulated AIO. This function should be called by an i/o-handler
thread.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the
			non-ibuf read threads, and as the last are the
			non-ibuf write threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example
@param[out]	m2	Callback argument
@param[in]	type	IO context
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
905
#ifdef WIN_ASYNC_IO
/** This function is only used in Windows asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing!
@param[in]	segment		The number of the segment in the aio arrays to
				wait for; segment 0 is the ibuf I/O thread,
				segment 1 the log I/O thread, then follow the
				non-ibuf read threads, and as the last are the
				non-ibuf write threads; if this is
				ULINT_UNDEFINED, then it means that sync AIO
				is used, and this parameter is ignored
@param[in]	pos		this parameter is used only in sync AIO:
				wait for the aio slot at this position
@param[out]	m1		the messages passed with the AIO request; note
				that also in the case where the AIO operation
				failed, these output parameters are valid and
				can be used to restart the operation,
				for example
@param[out]	m2		callback message
@param[out]	type		OS_FILE_WRITE or ..._READ
@return DB_SUCCESS or error code */
static
dberr_t
os_aio_windows_handler(
	ulint		segment,
	ulint		pos,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type);
#endif /* WIN_ASYNC_IO */
939
940 /** Allocate a page for sync IO
941 @return pointer to page */
942 static
943 Block*
os_alloc_block()944 os_alloc_block()
945 {
946 size_t pos;
947 Blocks& blocks = *block_cache;
948 size_t i = static_cast<size_t>(my_timer_cycles());
949 const size_t size = blocks.size();
950 ulint retry = 0;
951 Block* block;
952
953 DBUG_EXECUTE_IF("os_block_cache_busy", retry = MAX_BLOCKS * 3;);
954
955 for (;;) {
956
957 /* After go through the block cache for 3 times,
958 allocate a new temporary block. */
959 if (retry == MAX_BLOCKS * 3) {
960 byte* ptr;
961
962 ptr = static_cast<byte*>(
963 ut_malloc_nokey(sizeof(*block)
964 + BUFFER_BLOCK_SIZE));
965
966 block = new (ptr) Block();
967 block->m_ptr = static_cast<byte*>(
968 ptr + sizeof(*block));
969 block->m_in_use = 1;
970
971 break;
972 }
973
974 pos = i++ % size;
975
976 if (TAS(&blocks[pos].m_in_use, 1) == 0) {
977 block = &blocks[pos];
978 break;
979 }
980
981 os_thread_yield();
982
983 ++retry;
984 }
985
986 ut_a(block->m_in_use != 0);
987
988 return(block);
989 }
990
991 /** Free a page after sync IO
992 @param[in,own] block The block to free/release */
993 static
994 void
os_free_block(Block * block)995 os_free_block(Block* block)
996 {
997 ut_ad(block->m_in_use == 1);
998
999 TAS(&block->m_in_use, 0);
1000
1001 /* When this block is not in the block cache, and it's
1002 a temporary block, we need to free it directly. */
1003 if (std::less<Block*>()(block, &block_cache->front())
1004 || std::greater<Block*>()(block, &block_cache->back())) {
1005 ut_free(block);
1006 }
1007 }
1008
1009 /** Generic AIO Handler methods. Currently handles IO post processing. */
1010 class AIOHandler {
1011 public:
1012 /** Do any post processing after a read/write
1013 @return DB_SUCCESS or error code. */
1014 static dberr_t post_io_processing(Slot* slot);
1015
1016 /** Decompress after a read and punch a hole in the file if
1017 it was a write */
io_complete(const Slot * slot)1018 static dberr_t io_complete(const Slot* slot)
1019 {
1020 ut_a(slot->offset > 0);
1021 ut_a(slot->type.is_read() || !slot->skip_punch_hole);
1022 return(os_file_io_complete(
1023 slot->type, slot->file.m_file, slot->buf,
1024 NULL, slot->original_len,
1025 slot->offset, slot->len));
1026 }
1027
1028 private:
1029 /** Check whether the page was encrypted.
1030 @param[in] slot The slot that contains the IO request
1031 @return true if it was an encyrpted page */
is_encrypted_page(const Slot * slot)1032 static bool is_encrypted_page(const Slot* slot)
1033 {
1034 return(Encryption::is_encrypted_page(slot->buf));
1035 }
1036
1037 /** Check whether the page was compressed.
1038 @param[in] slot The slot that contains the IO request
1039 @return true if it was a compressed page */
is_compressed_page(const Slot * slot)1040 static bool is_compressed_page(const Slot* slot)
1041 {
1042 const byte* src = slot->buf;
1043
1044 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1045
1046 return(page_type == FIL_PAGE_COMPRESSED);
1047 }
1048
1049 /** Get the compressed page size.
1050 @param[in] slot The slot that contains the IO request
1051 @return number of bytes to read for a successful decompress */
compressed_page_size(const Slot * slot)1052 static ulint compressed_page_size(const Slot* slot)
1053 {
1054 ut_ad(slot->type.is_read());
1055 ut_ad(is_compressed_page(slot));
1056
1057 ulint size;
1058 const byte* src = slot->buf;
1059
1060 size = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1);
1061
1062 return(size + FIL_PAGE_DATA);
1063 }
1064
1065 /** Check if the page contents can be decompressed.
1066 @param[in] slot The slot that contains the IO request
1067 @return true if the data read has all the compressed data */
can_decompress(const Slot * slot)1068 static bool can_decompress(const Slot* slot)
1069 {
1070 ut_ad(slot->type.is_read());
1071 ut_ad(is_compressed_page(slot));
1072
1073 ulint version;
1074 const byte* src = slot->buf;
1075
1076 version = mach_read_from_1(src + FIL_PAGE_VERSION);
1077
1078 ut_a(Compression::is_valid_page_version(version));
1079
1080 /* Includes the page header size too */
1081 ulint size = compressed_page_size(slot);
1082
1083 return(size <= (slot->ptr - slot->buf) + (ulint) slot->n_bytes);
1084 }
1085
1086 /** Check if we need to read some more data.
1087 @param[in] slot The slot that contains the IO request
1088 @param[in] n_bytes Total bytes read so far
1089 @return DB_SUCCESS or error code */
1090 static dberr_t check_read(Slot* slot, ulint n_bytes);
1091 };
1092
1093 /** Helper class for doing synchronous file IO. Currently, the objective
1094 is to hide the OS specific code, so that the higher level functions aren't
1095 peppered with #ifdef. Makes the code flow difficult to follow. */
1096 class SyncFileIO {
1097 public:
1098 /** Constructor
1099 @param[in] fh File handle
1100 @param[in,out] buf Buffer to read/write
1101 @param[in] n Number of bytes to read/write
1102 @param[in] offset Offset where to read or write */
SyncFileIO(os_file_t fh,void * buf,ulint n,os_offset_t offset)1103 SyncFileIO(os_file_t fh, void* buf, ulint n, os_offset_t offset)
1104 :
1105 m_fh(fh),
1106 m_buf(buf),
1107 m_n(static_cast<ssize_t>(n)),
1108 m_offset(offset)
1109 {
1110 ut_ad(m_n > 0);
1111 }
1112
1113 /** Destructor */
~SyncFileIO()1114 ~SyncFileIO()
1115 {
1116 /* No op */
1117 }
1118
1119 /** Do the read/write
1120 @param[in] request The IO context and type
1121 @return the number of bytes read/written or negative value on error */
1122 ssize_t execute(const IORequest& request);
1123
1124 /** Do the read/write
1125 @param[in,out] slot The IO slot, it has the IO context
1126 @return the number of bytes read/written or negative value on error */
1127 static ssize_t execute(Slot* slot);
1128
1129 /** Move the read/write offset up to where the partial IO succeeded.
1130 @param[in] n_bytes The number of bytes to advance */
advance(ssize_t n_bytes)1131 void advance(ssize_t n_bytes)
1132 {
1133 m_offset += n_bytes;
1134
1135 ut_ad(m_n >= n_bytes);
1136
1137 m_n -= n_bytes;
1138
1139 m_buf = reinterpret_cast<uchar*>(m_buf) + n_bytes;
1140 }
1141
1142 private:
1143 /** Open file handle */
1144 os_file_t m_fh;
1145
1146 /** Buffer to read/write */
1147 void* m_buf;
1148
1149 /** Number of bytes to read/write */
1150 ssize_t m_n;
1151
1152 /** Offset from where to read/write */
1153 os_offset_t m_offset;
1154 };
1155
1156 /** If it is a compressed page return the compressed page data + footer size
1157 @param[in] buf Buffer to check, must include header + 10 bytes
1158 @return ULINT_UNDEFINED if the page is not a compressed page or length
1159 of the compressed data (including footer) if it is a compressed page */
1160 ulint
os_file_compressed_page_size(const byte * buf)1161 os_file_compressed_page_size(const byte* buf)
1162 {
1163 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1164
1165 if (type == FIL_PAGE_COMPRESSED) {
1166 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1167 ut_a(Compression::is_valid_page_version(version));
1168 return(mach_read_from_2(buf + FIL_PAGE_COMPRESS_SIZE_V1));
1169 }
1170
1171 return(ULINT_UNDEFINED);
1172 }
1173
1174 /** If it is a compressed page return the original page data + footer size
1175 @param[in] buf Buffer to check, must include header + 10 bytes
1176 @return ULINT_UNDEFINED if the page is not a compressed page or length
1177 of the original data + footer if it is a compressed page */
1178 ulint
os_file_original_page_size(const byte * buf)1179 os_file_original_page_size(const byte* buf)
1180 {
1181 ulint type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1182
1183 if (type == FIL_PAGE_COMPRESSED) {
1184
1185 ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION);
1186 ut_a(Compression::is_valid_page_version(version));
1187
1188 return(mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1));
1189 }
1190
1191 return(ULINT_UNDEFINED);
1192 }
1193
1194 /** Check if we need to read some more data.
1195 @param[in] slot The slot that contains the IO request
1196 @param[in] n_bytes Total bytes read so far
1197 @return DB_SUCCESS or error code */
1198 dberr_t
check_read(Slot * slot,ulint n_bytes)1199 AIOHandler::check_read(Slot* slot, ulint n_bytes)
1200 {
1201 dberr_t err;
1202
1203 ut_ad(slot->type.is_read());
1204 ut_ad(slot->original_len > slot->len);
1205
1206 if (is_compressed_page(slot)) {
1207
1208 if (can_decompress(slot)) {
1209
1210 ut_a(slot->offset > 0);
1211
1212 slot->len = slot->original_len;
1213 #ifdef _WIN32
1214 slot->n_bytes = static_cast<DWORD>(n_bytes);
1215 #else
1216 slot->n_bytes = static_cast<ulint>(n_bytes);
1217 #endif /* _WIN32 */
1218
1219 err = io_complete(slot);
1220 ut_a(err == DB_SUCCESS);
1221 } else {
1222 /* Read the next block in */
1223 ut_ad(compressed_page_size(slot) >= n_bytes);
1224
1225 err = DB_FAIL;
1226 }
1227 } else if (is_encrypted_page(slot)
1228 || (slot->type.is_log()
1229 && slot->offset >= LOG_FILE_HDR_SIZE)) {
1230 ut_a(slot->offset > 0);
1231
1232 slot->len = slot->original_len;
1233 #ifdef _WIN32
1234 slot->n_bytes = static_cast<DWORD>(n_bytes);
1235 #else
1236 slot->n_bytes = static_cast<ulint>(n_bytes);
1237 #endif /* _WIN32 */
1238
1239 err = io_complete(slot);
1240 ut_a(err == DB_SUCCESS);
1241
1242 } else {
1243 err = DB_FAIL;
1244 }
1245
1246 if (slot->buf_block != NULL) {
1247 os_free_block(slot->buf_block);
1248 slot->buf_block = NULL;
1249 }
1250
1251 if (slot->encrypt_log_buf != NULL) {
1252 ut_free(slot->encrypt_log_buf);
1253 slot->encrypt_log_buf = NULL;
1254 }
1255
1256 return(err);
1257 }
1258
1259 /** Do any post processing after a read/write
1260 @return DB_SUCCESS or error code. */
1261 dberr_t
post_io_processing(Slot * slot)1262 AIOHandler::post_io_processing(Slot* slot)
1263 {
1264 dberr_t err;
1265
1266 ut_ad(slot->is_reserved);
1267
1268 /* Total bytes read so far */
1269 ulint n_bytes = (slot->ptr - slot->buf) + slot->n_bytes;
1270
1271 /* Compressed writes can be smaller than the original length.
1272 Therefore they can be processed without further IO. */
1273 if (n_bytes == slot->original_len
1274 || (slot->type.is_write()
1275 && slot->type.is_compressed()
1276 && slot->len == static_cast<ulint>(slot->n_bytes))) {
1277
1278 if ((slot->type.is_log() && slot->offset >= LOG_FILE_HDR_SIZE)
1279 || is_compressed_page(slot) || is_encrypted_page(slot)) {
1280
1281 ut_a(slot->offset > 0);
1282
1283 if (slot->type.is_read()) {
1284 slot->len = slot->original_len;
1285 }
1286
1287 /* The punch hole has been done on collect() */
1288
1289 if (slot->type.is_read()) {
1290 err = io_complete(slot);
1291 } else {
1292 err = DB_SUCCESS;
1293 }
1294
1295 ut_ad(err == DB_SUCCESS
1296 || err == DB_UNSUPPORTED
1297 || err == DB_CORRUPTION
1298 || err == DB_IO_DECOMPRESS_FAIL);
1299 } else if (!slot->type.is_log() && slot->type.is_read() && Encryption::can_page_be_keyring_encrypted(slot->buf)
1300 && !slot->type.is_encryption_disabled()) {
1301 ut_ad(is_encrypted_page(slot) == false);
1302 // we did not go to io_complete - so mark read page as unencrypted here
1303 mach_write_to_4(slot->buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1304 err = DB_SUCCESS;
1305 }
1306 else {
1307
1308 err = DB_SUCCESS;
1309 }
1310
1311 if (slot->buf_block != NULL) {
1312 os_free_block(slot->buf_block);
1313 slot->buf_block = NULL;
1314 }
1315
1316 if (slot->encrypt_log_buf != NULL) {
1317 ut_free(slot->encrypt_log_buf);
1318 slot->encrypt_log_buf = NULL;
1319 }
1320 } else if ((ulint) slot->n_bytes == (ulint) slot->len) {
1321
1322 /* It *must* be a partial read. */
1323 ut_ad(slot->len < slot->original_len);
1324
1325 /* Has to be a read request, if it is less than
1326 the original length. */
1327 ut_ad(slot->type.is_read());
1328 err = check_read(slot, n_bytes);
1329
1330 } else {
1331 err = DB_FAIL;
1332 }
1333
1334 return(err);
1335 }
1336
1337 /** Count the number of free slots
1338 @return number of reserved slots */
1339 ulint
pending_io_count() const1340 AIO::pending_io_count() const
1341 {
1342 acquire();
1343
1344 #ifdef UNIV_DEBUG
1345 ut_a(m_n_segments > 0);
1346 ut_a(!m_slots.empty());
1347
1348 ulint count = 0;
1349
1350 for (ulint i = 0; i < m_slots.size(); ++i) {
1351
1352 const Slot& slot = m_slots[i];
1353
1354 if (slot.is_reserved) {
1355 ++count;
1356 ut_a(slot.len > 0);
1357 }
1358 }
1359
1360 ut_a(m_n_reserved == count);
1361 #endif /* UNIV_DEBUG */
1362
1363 ulint reserved = m_n_reserved;
1364
1365 release();
1366
1367 return(reserved);
1368 }
1369
1370 /** Compress a data page
1371 #param[in] block_size File system block size
1372 @param[in] src Source contents to compress
1373 @param[in] src_len Length in bytes of the source
1374 @param[out] dst Compressed page contents
1375 @param[out] dst_len Length in bytes of dst contents
1376 @return buffer data, dst_len will have the length of the data */
1377 static
1378 byte*
os_file_compress_page(Compression compression,ulint block_size,byte * src,ulint src_len,byte * dst,ulint * dst_len,bool will_be_encrypted_with_keyring)1379 os_file_compress_page(
1380 Compression compression,
1381 ulint block_size,
1382 byte* src,
1383 ulint src_len,
1384 byte* dst,
1385 ulint* dst_len,
1386 bool will_be_encrypted_with_keyring)
1387 {
1388 ulint len = 0;
1389 ulint compression_level = page_zip_level;
1390 ulint page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
1391
1392 /* The page size must be a multiple of the OS punch hole size. */
1393 ut_ad(!(src_len % block_size));
1394
1395 /* Shouldn't compress an already compressed page. */
1396 ut_ad(page_type != FIL_PAGE_COMPRESSED);
1397
1398 /* The page must be at least twice as large as the file system
1399 block size if we are to save any space. Ignore R-Tree pages for now,
1400 they repurpose the same 8 bytes in the page header. No point in
1401 compressing if the file system block size >= our page size. */
1402
1403 if (page_type == FIL_PAGE_RTREE
1404 || block_size == ULINT_UNDEFINED
1405 || compression.m_type == Compression::NONE
1406 || src_len < block_size * 2) {
1407
1408 *dst_len = src_len;
1409
1410 return(src);
1411 }
1412
1413 /* Leave the header alone when compressing. */
1414 ut_ad(block_size >= FIL_PAGE_DATA * 2);
1415
1416 ut_ad(src_len > FIL_PAGE_DATA + block_size);
1417
1418 /* Must compress to <= N-1 FS blocks. */
1419 /* There need to be at least 4 bytes for key version and 4 bytes for post encryption
1420 checksum */
1421 ulint out_len = src_len - (FIL_PAGE_DATA + block_size + ((will_be_encrypted_with_keyring) ? 8 : 0));
1422
1423 /* This is the original data page size - the page header. */
1424 ulint content_len = src_len - FIL_PAGE_DATA;
1425
1426 ut_ad(out_len >= block_size - FIL_PAGE_DATA + ((will_be_encrypted_with_keyring) ? 8 : 0));
1427 ut_ad(out_len <= src_len - (block_size + FIL_PAGE_DATA + (will_be_encrypted_with_keyring ? 8 : 0)));
1428
1429 /* Only compress the data + trailer, leave the header alone */
1430
1431 switch (compression.m_type) {
1432 case Compression::NONE:
1433 ut_error;
1434
1435 case Compression::ZLIB: {
1436
1437 uLongf zlen = static_cast<uLongf>(out_len);
1438
1439 if (compress2(
1440 dst + FIL_PAGE_DATA,
1441 &zlen,
1442 src + FIL_PAGE_DATA,
1443 static_cast<uLong>(content_len),
1444 static_cast<int>(compression_level)) != Z_OK) {
1445
1446 *dst_len = src_len;
1447
1448 return(src);
1449 }
1450
1451 len = static_cast<ulint>(zlen);
1452
1453 break;
1454 }
1455
1456 case Compression::LZ4:
1457
1458 len = LZ4_compress_default(
1459 reinterpret_cast<char*>(src) + FIL_PAGE_DATA,
1460 reinterpret_cast<char*>(dst) + FIL_PAGE_DATA,
1461 static_cast<int>(content_len),
1462 static_cast<int>(out_len));
1463
1464 ut_a(len <= src_len - FIL_PAGE_DATA);
1465
1466 if (len == 0 || len >= out_len) {
1467
1468 *dst_len = src_len;
1469
1470 return(src);
1471 }
1472
1473 break;
1474
1475 default:
1476 *dst_len = src_len;
1477 return(src);
1478 }
1479
1480 ut_a(len <= out_len);
1481
1482 ut_ad(memcmp(src + FIL_PAGE_LSN + 4,
1483 src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)
1484 == 0);
1485
1486 /* Copy the header as is. */
1487 memmove(dst, src, FIL_PAGE_DATA);
1488
1489 /* Add compression control information. Required for decompressing. */
1490 mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
1491
1492 mach_write_to_1(dst + FIL_PAGE_VERSION, Compression::FIL_PAGE_VERSION_2);
1493
1494 mach_write_to_1(dst + FIL_PAGE_ALGORITHM_V1, compression.m_type);
1495
1496 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
1497
1498 mach_write_to_2(dst + FIL_PAGE_ORIGINAL_SIZE_V1, content_len);
1499
1500 mach_write_to_2(dst + FIL_PAGE_COMPRESS_SIZE_V1, len);
1501
1502 /* Round to the next full block size */
1503
1504 len += FIL_PAGE_DATA;
1505
1506 if (will_be_encrypted_with_keyring) {
1507 mach_write_to_8(dst + len, 0);
1508 len += 8;
1509 }
1510
1511 // For encryption with keyring keys we required that there will be at least 8 bytes left
1512 // 4 bytes for key version and 4 bytes for post encryption checksum
1513 *dst_len = ut_calc_align(len, block_size);
1514
1515 ut_ad(*dst_len >= len && *dst_len <= out_len + FIL_PAGE_DATA + (will_be_encrypted_with_keyring ? 8 : 0));
1516
1517 /* Clear out the unused portion of the page. */
1518 if (len % block_size) {
1519 memset(dst + len, 0x0, block_size - (len % block_size));
1520 }
1521
1522 return(dst);
1523 }
1524
1525 #ifdef UNIV_DEBUG
1526 # ifndef UNIV_HOTBACKUP
1527 /** Validates the consistency the aio system some of the time.
1528 @return true if ok or the check was skipped */
1529 bool
os_aio_validate_skip()1530 os_aio_validate_skip()
1531 {
1532 /** Try os_aio_validate() every this many times */
1533 # define OS_AIO_VALIDATE_SKIP 13
1534
1535 /** The os_aio_validate() call skip counter.
1536 Use a signed type because of the race condition below. */
1537 static int os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1538
1539 /* There is a race condition below, but it does not matter,
1540 because this call is only for heuristic purposes. We want to
1541 reduce the call frequency of the costly os_aio_validate()
1542 check in debug builds. */
1543 --os_aio_validate_count;
1544
1545 if (os_aio_validate_count > 0) {
1546 return(true);
1547 }
1548
1549 os_aio_validate_count = OS_AIO_VALIDATE_SKIP;
1550 return(os_aio_validate());
1551 }
1552 # endif /* !UNIV_HOTBACKUP */
1553 #endif /* UNIV_DEBUG */
1554
1555 #undef USE_FILE_LOCK
1556 #define USE_FILE_LOCK
1557 #if defined(UNIV_HOTBACKUP) || defined(_WIN32)
1558 /* InnoDB Hot Backup does not lock the data files.
1559 * On Windows, mandatory locking is used.
1560 */
1561 # undef USE_FILE_LOCK
1562 #endif
1563 #ifdef USE_FILE_LOCK
1564 /** Obtain an exclusive lock on a file.
1565 @param[in] fd file descriptor
1566 @param[in] name file name
1567 @return 0 on success */
1568 static
1569 int
os_file_lock(int fd,const char * name)1570 os_file_lock(
1571 int fd,
1572 const char* name)
1573 {
1574 struct flock lk;
1575
1576 lk.l_type = F_WRLCK;
1577 lk.l_whence = SEEK_SET;
1578 lk.l_start = lk.l_len = 0;
1579
1580 if (fcntl(fd, F_SETLK, &lk) == -1) {
1581
1582 ib::error()
1583 << "Unable to lock " << name
1584 << " error: " << errno;
1585
1586 if (errno == EAGAIN || errno == EACCES) {
1587
1588 ib::info()
1589 << "Check that you do not already have"
1590 " another mysqld process using the"
1591 " same InnoDB data or log files.";
1592 }
1593
1594 return(-1);
1595 }
1596
1597 return(0);
1598 }
1599 #endif /* USE_FILE_LOCK */
1600
1601 #ifndef UNIV_HOTBACKUP
1602
1603 /** Calculates local segment number and aio array from global segment number.
1604 @param[out] array aio wait array
1605 @param[in] segment global segment number
1606 @return local segment number within the aio array */
1607 ulint
get_array_and_local_segment(AIO ** array,ulint segment)1608 AIO::get_array_and_local_segment(
1609 AIO** array,
1610 ulint segment)
1611 {
1612 ulint local_segment;
1613 ulint n_extra_segs = (srv_read_only_mode) ? 0 : 2;
1614
1615 ut_a(segment < os_aio_n_segments);
1616
1617 if (!srv_read_only_mode && segment < n_extra_segs) {
1618
1619 /* We don't support ibuf/log IO during read only mode. */
1620
1621 if (segment == IO_IBUF_SEGMENT) {
1622
1623 *array = s_ibuf;
1624
1625 } else if (segment == IO_LOG_SEGMENT) {
1626
1627 *array = s_log;
1628
1629 } else {
1630 *array = NULL;
1631 }
1632
1633 local_segment = 0;
1634
1635 } else if (segment < s_reads->m_n_segments + n_extra_segs) {
1636
1637 *array = s_reads;
1638 local_segment = segment - n_extra_segs;
1639
1640 } else {
1641 *array = s_writes;
1642
1643 local_segment = segment
1644 - (s_reads->m_n_segments + n_extra_segs);
1645 }
1646
1647 return(local_segment);
1648 }
1649
1650 /** Frees a slot in the aio array. Assumes caller owns the mutex.
1651 @param[in,out] slot Slot to release */
1652 void
release(Slot * slot)1653 AIO::release(Slot* slot)
1654 {
1655 ut_ad(is_mutex_owned());
1656
1657 ut_ad(slot->is_reserved);
1658
1659 slot->is_reserved = false;
1660
1661 --m_n_reserved;
1662
1663 if (m_n_reserved == m_slots.size() - 1) {
1664 os_event_set(m_not_full);
1665 }
1666
1667 if (m_n_reserved == 0) {
1668 os_event_set(m_is_empty);
1669 }
1670
1671 #ifdef WIN_ASYNC_IO
1672
1673 ResetEvent(slot->handle);
1674
1675 #elif defined(LINUX_NATIVE_AIO)
1676
1677 if (srv_use_native_aio) {
1678 memset(&slot->control, 0x0, sizeof(slot->control));
1679 slot->ret = 0;
1680 slot->n_bytes = 0;
1681 } else {
1682 /* These fields should not be used if we are not
1683 using native AIO. */
1684 ut_ad(slot->n_bytes == 0);
1685 ut_ad(slot->ret == 0);
1686 }
1687
1688 #endif /* WIN_ASYNC_IO */
1689 }
1690
1691 /** Frees a slot in the AIO array. Assumes caller doesn't own the mutex.
1692 @param[in,out] slot Slot to release */
1693 void
release_with_mutex(Slot * slot)1694 AIO::release_with_mutex(Slot* slot)
1695 {
1696 acquire();
1697
1698 release(slot);
1699
1700 release();
1701 }
1702
1703 /** Creates a temporary file. This function is like tmpfile(3), but
1704 the temporary file is created in the given parameter path. If the path
1705 is NULL then it will create the file in the MySQL server configuration
1706 parameter (--tmpdir).
1707 @param[in] path location for creating temporary file
1708 @return temporary file handle, or NULL on error */
1709 FILE*
os_file_create_tmpfile(const char * path)1710 os_file_create_tmpfile(
1711 const char* path)
1712 {
1713 FILE* file = NULL;
1714 int fd = innobase_mysql_tmpfile(path);
1715
1716 if (fd >= 0) {
1717 file = fdopen(fd, "w+b");
1718 }
1719
1720 if (file == NULL) {
1721
1722 ib::error()
1723 << "Unable to create temporary file; errno: "
1724 << errno;
1725
1726 if (fd >= 0) {
1727 close(fd);
1728 }
1729 }
1730
1731 return(file);
1732 }
1733
1734 /** Rewind file to its start, read at most size - 1 bytes from it to str, and
1735 NUL-terminate str. All errors are silently ignored. This function is
1736 mostly meant to be used with temporary files.
1737 @param[in,out] file File to read from
1738 @param[in,out] str Buffer where to read
1739 @param[in] size Size of buffer */
1740 void
os_file_read_string(FILE * file,char * str,ulint size)1741 os_file_read_string(
1742 FILE* file,
1743 char* str,
1744 ulint size)
1745 {
1746 if (size != 0) {
1747 rewind(file);
1748
1749 size_t flen = fread(str, 1, size - 1, file);
1750
1751 str[flen] = '\0';
1752 }
1753 }
1754
1755 static
1756 dberr_t
verify_post_encryption_checksum(const IORequest & type,Encryption & encryption,byte * buf,ulint src_len)1757 verify_post_encryption_checksum(const IORequest &type, Encryption &encryption,
1758 byte *buf, ulint src_len)
1759 {
1760 bool is_crypt_checksum_correct = false; // For MK encryption is_crypt_checksum_correct stays false
1761 ulint original_type = static_cast<uint16_t>(
1762 mach_read_from_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1));
1763
1764 if (encryption.m_type == Encryption::KEYRING && Encryption::can_page_be_keyring_encrypted(original_type)) {
1765 if (type.is_page_zip_compressed()) {
1766 byte zip_magic[ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN];
1767 memcpy(zip_magic, buf + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC,
1768 ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);
1769 is_crypt_checksum_correct = memcmp(zip_magic, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC,
1770 ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN) == 0;
1771 } else
1772 is_crypt_checksum_correct = fil_space_verify_crypt_checksum(buf, src_len, type.is_page_zip_compressed(),
1773 encryption.is_encrypted_and_compressed(buf));
1774
1775 if (encryption.m_encryption_rotation == Encryption::NO_ROTATION && !is_crypt_checksum_correct) { // There is no re-encryption going on
1776 ulint space_id = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
1777 ulint page_no = mach_read_from_4(buf + FIL_PAGE_OFFSET);
1778 ib::error() << "Post - encryption checksum verification failed - decryption failed for space id = " << space_id
1779 << " page_no = " << page_no;
1780
1781 return (DB_IO_DECRYPT_FAIL);
1782 }
1783 }
1784
1785 if (encryption.m_encryption_rotation == Encryption::MASTER_KEY_TO_KEYRING) { // There is re-encryption going on
1786 encryption.m_type = is_crypt_checksum_correct
1787 ? Encryption::KEYRING // assume page is RK encrypted
1788 : Encryption::AES; // assume page is MK encrypted
1789 }
1790
1791 return DB_SUCCESS;
1792 }
1793
1794 static
1795 void
assing_key_version(byte * buf,Encryption & encryption,bool is_page_encrypted)1796 assing_key_version(
1797 byte* buf,
1798 Encryption &encryption,
1799 bool is_page_encrypted)
1800 {
1801 if (is_page_encrypted && encryption.m_type == Encryption::KEYRING)
1802 {
1803 mach_write_to_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1, FIL_PAGE_ENCRYPTED);
1804 ut_ad(encryption.m_key_version != ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1805 mach_write_to_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, encryption.m_key_version);
1806 }
1807 else
1808 mach_write_to_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION, ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1809 }
1810
1811 static
1812 bool
load_key_needed_for_decryption(const IORequest & type,Encryption & encryption,byte * buf)1813 load_key_needed_for_decryption(
1814 const IORequest& type,
1815 Encryption &encryption,
1816 byte *buf)
1817 {
1818 if (encryption.m_type == Encryption::KEYRING)
1819 {
1820 ulint key_version_read_from_page = ENCRYPTION_KEY_VERSION_INVALID;
1821 ulint page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);
1822 if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
1823 key_version_read_from_page= mach_read_from_4(buf + FIL_PAGE_DATA + 4);
1824 else
1825 {
1826 ut_ad(page_type == FIL_PAGE_ENCRYPTED);
1827 key_version_read_from_page= mach_read_from_4(buf + FIL_PAGE_ENCRYPTION_KEY_VERSION);
1828 }
1829
1830 ut_ad(key_version_read_from_page != ENCRYPTION_KEY_VERSION_INVALID);
1831 ut_ad(key_version_read_from_page != ENCRYPTION_KEY_VERSION_NOT_ENCRYPTED);
1832
1833 // in rare cases - when (re-)encryption was aborted there can be pages encrypted with
1834 // different key versions in a given tablespace - retrieve needed key here
1835
1836 byte *key_read;
1837
1838 size_t key_len;
1839 if (Encryption::get_tablespace_key(encryption.m_key_id,
1840 key_version_read_from_page,
1841 &key_read, &key_len) == false)
1842 {
1843 return false;
1844 ut_ad(0);
1845 }
1846
1847 //For test
1848 if (key_version_read_from_page == encryption.m_key_version) {
1849 ut_ad(memcmp(key_read, encryption.m_key, key_len) == 0);
1850 }
1851
1852 // TODO: Allocated or not depends on whether key was taken from cache or keyring
1853 encryption.set_key(key_read, static_cast<ulint>(key_len), true);
1854 //encryption.m_key = key_read;
1855 //******
1856
1857 //encryption.m_klen = static_cast<ulint>(key_len);
1858 encryption.m_key_version = key_version_read_from_page;
1859 //encryption.m_free_key_on_delete= true; // we own the key
1860 }
1861 else {
1862 ut_ad(encryption.m_type == Encryption::AES);
1863 if (encryption.m_encryption_rotation == Encryption::NO_ROTATION)
1864 return true; // we are all set - needed key was alread loaded into encryption module
1865
1866 ut_ad(encryption.m_encryption_rotation == Encryption::MASTER_KEY_TO_KEYRING);
1867 ut_ad(encryption.m_tablespace_iv != NULL);
1868 encryption.m_iv = encryption.m_tablespace_iv; // iv comes from tablespace header for MK encryption
1869 ut_ad(encryption.m_tablespace_key != NULL);
1870 encryption.set_key(encryption.m_tablespace_key,
1871 ENCRYPTION_KEY_LEN, false);
1872 }
1873
1874 return true;
1875 }
1876
1877 /** Decompress after a read and punch a hole in the file if it was a write
1878 @param[in] type IO context
1879 @param[in] fh Open file handle
1880 @param[in,out] buf Buffer to transform
1881 @param[in,out] scratch Scratch area for read decompression
1882 @param[in] src_len Length of the buffer before compression
1883 @param[in] len Used buffer length for write and output
1884 buf len for read
1885 @return DB_SUCCESS or error code */
1886 static
1887 dberr_t
os_file_io_complete(const IORequest & type,os_file_t fh,byte * buf,byte * scratch,ulint src_len,os_offset_t offset,ulint len)1888 os_file_io_complete(
1889 const IORequest&type,
1890 os_file_t fh,
1891 byte* buf,
1892 byte* scratch,
1893 ulint src_len,
1894 os_offset_t offset,
1895 ulint len)
1896 {
1897 dberr_t ret = DB_SUCCESS;
1898
1899 /* We never compress/decompress the first page */
1900 ut_a(offset > 0);
1901 ut_ad(type.validate());
1902
1903 if (!type.is_compression_enabled()) {
1904 if (type.is_log() && offset >= LOG_FILE_HDR_SIZE
1905 && !type.is_encryption_disabled()) {
1906 Encryption encryption(type.encryption_algorithm());
1907
1908 ret = encryption.decrypt_log(type, buf, src_len,
1909 scratch, len);
1910 }
1911
1912 return(ret);
1913 } else if (type.is_read()) {
1914 Encryption encryption(type.encryption_algorithm());
1915
1916 bool is_page_encrypted= type.is_encryption_disabled()
1917 ? false
1918 : encryption.is_encrypted_page(buf);
1919
1920 if (is_page_encrypted)
1921 {
1922 dberr_t err = verify_post_encryption_checksum(type, encryption, buf, src_len);
1923 if (err != DB_SUCCESS)
1924 return err;
1925
1926 if (!load_key_needed_for_decryption(type, encryption, buf))
1927 return DB_DECRYPTION_FAILED;
1928
1929 ret = encryption.decrypt(type, buf, src_len, scratch, len);
1930 if (ret != DB_SUCCESS)
1931 return ret;
1932 }
1933
1934 ret = os_file_decompress_page(type.is_dblwr_recover(),
1935 buf, scratch, len);
1936 if (ret != DB_SUCCESS)
1937 return ret;
1938 if (Encryption::can_page_be_keyring_encrypted(buf) && !type.is_encryption_disabled())
1939 assing_key_version(buf, encryption, is_page_encrypted); // is_page_encrypted meaning page was encrypted before calling decrypt
1940
1941
1942 } else if (type.punch_hole()) {
1943
1944 ut_ad(len <= src_len);
1945 ut_ad(!type.is_log());
1946 ut_ad(type.is_write());
1947 ut_ad(type.is_compressed());
1948
1949 /* Nothing to do. */
1950 if (len == src_len) {
1951 return(DB_SUCCESS);
1952 }
1953
1954 #ifdef UNIV_DEBUG
1955 const ulint block_size = type.block_size();
1956 #endif /* UNIV_DEBUG */
1957
1958 /* We don't support multiple page sizes in the server
1959 at the moment. */
1960 ut_ad(src_len == srv_page_size);
1961
1962 /* Must be a multiple of the compression unit size. */
1963 ut_ad((len % block_size) == 0);
1964 ut_ad((offset % block_size) == 0);
1965
1966 ut_ad(len + block_size <= src_len);
1967
1968 offset += len;
1969
1970 return(os_file_punch_hole(fh, offset, src_len - len));
1971 }
1972 #ifdef UNIV_DEBUG
1973 if (type.is_write() && type.encryption_algorithm().m_type == Encryption::KEYRING) {
1974 Encryption encryption(type.encryption_algorithm());
1975 bool was_page_encrypted= encryption.is_encrypted_page(buf);
1976
1977 //TODO:Robert czy bez type.is_page_zip_compressed to działa - powinno
1978 ut_ad(!was_page_encrypted || //!type.is_page_zip_compressed() ||
1979 fil_space_verify_crypt_checksum(buf, src_len, type.is_page_zip_compressed(), encryption.is_encrypted_and_compressed(buf)));
1980 }
1981 #endif
1982
1983 ut_ad(!type.is_log());
1984
1985 return(DB_SUCCESS);
1986 }
1987
1988 #endif /* !UNIV_HOTBACKUP */
1989
1990 /** This function returns a new path name after replacing the basename
1991 in an old path with a new basename. The old_path is a full path
1992 name including the extension. The tablename is in the normal
1993 form "databasename/tablename". The new base name is found after
1994 the forward slash. Both input strings are null terminated.
1995
1996 This function allocates memory to be returned. It is the callers
1997 responsibility to free the return value after it is no longer needed.
1998
1999 @param[in] old_path Pathname
2000 @param[in] tablename Contains new base name
2001 @return own: new full pathname */
2002 char*
os_file_make_new_pathname(const char * old_path,const char * tablename)2003 os_file_make_new_pathname(
2004 const char* old_path,
2005 const char* tablename)
2006 {
2007 ulint dir_len;
2008 char* last_slash;
2009 char* base_name;
2010 char* new_path;
2011 ulint new_path_len;
2012
2013 /* Split the tablename into its database and table name components.
2014 They are separated by a '/'. */
2015 last_slash = strrchr((char*) tablename, '/');
2016 base_name = last_slash ? last_slash + 1 : (char*) tablename;
2017
2018 /* Find the offset of the last slash. We will strip off the
2019 old basename.ibd which starts after that slash. */
2020 last_slash = strrchr((char*) old_path, OS_PATH_SEPARATOR);
2021 dir_len = last_slash ? last_slash - old_path : strlen(old_path);
2022
2023 /* allocate a new path and move the old directory path to it. */
2024 new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd";
2025 new_path = static_cast<char*>(ut_malloc_nokey(new_path_len));
2026 memcpy(new_path, old_path, dir_len);
2027
2028 ut_snprintf(new_path + dir_len,
2029 new_path_len - dir_len,
2030 "%c%s.ibd",
2031 OS_PATH_SEPARATOR,
2032 base_name);
2033
2034 return(new_path);
2035 }
2036
2037 /** This function reduces a null-terminated full remote path name into
2038 the path that is sent by MySQL for DATA DIRECTORY clause. It replaces
2039 the 'databasename/tablename.ibd' found at the end of the path with just
2040 'tablename'.
2041
2042 Since the result is always smaller than the path sent in, no new memory
2043 is allocated. The caller should allocate memory for the path sent in.
2044 This function manipulates that path in place.
2045
2046 If the path format is not as expected, just return. The result is used
2047 to inform a SHOW CREATE TABLE command.
2048 @param[in,out] data_dir_path Full path/data_dir_path */
2049 void
os_file_make_data_dir_path(char * data_dir_path)2050 os_file_make_data_dir_path(
2051 char* data_dir_path)
2052 {
2053 /* Replace the period before the extension with a null byte. */
2054 char* ptr = strrchr((char*) data_dir_path, '.');
2055
2056 if (ptr == NULL) {
2057 return;
2058 }
2059
2060 ptr[0] = '\0';
2061
2062 /* The tablename starts after the last slash. */
2063 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
2064
2065 if (ptr == NULL) {
2066 return;
2067 }
2068
2069 ptr[0] = '\0';
2070
2071 char* tablename = ptr + 1;
2072
2073 /* The databasename starts after the next to last slash. */
2074 ptr = strrchr((char*) data_dir_path, OS_PATH_SEPARATOR);
2075
2076 if (ptr == NULL) {
2077 return;
2078 }
2079
2080 ulint tablename_len = ut_strlen(tablename);
2081
2082 ut_memmove(++ptr, tablename, tablename_len);
2083
2084 ptr[tablename_len] = '\0';
2085 }
2086
2087 /** Check if the path refers to the root of a drive using a pointer
2088 to the last directory separator that the caller has fixed.
2089 @param[in] path path name
2090 @param[in] path last directory separator in the path
2091 @return true if this path is a drive root, false if not */
2092 UNIV_INLINE
2093 bool
os_file_is_root(const char * path,const char * last_slash)2094 os_file_is_root(
2095 const char* path,
2096 const char* last_slash)
2097 {
2098 return(
2099 #ifdef _WIN32
2100 (last_slash == path + 2 && path[1] == ':') ||
2101 #endif /* _WIN32 */
2102 last_slash == path);
2103 }
2104
2105 /** Return the parent directory component of a null-terminated path.
2106 Return a new buffer containing the string up to, but not including,
2107 the final component of the path.
2108 The path returned will not contain a trailing separator.
2109 Do not return a root path, return NULL instead.
2110 The final component trimmed off may be a filename or a directory name.
2111 If the final component is the only component of the path, return NULL.
2112 It is the caller's responsibility to free the returned string after it
2113 is no longer needed.
2114 @param[in] path Path name
2115 @return own: parent directory of the path */
2116 static
2117 char*
os_file_get_parent_dir(const char * path)2118 os_file_get_parent_dir(
2119 const char* path)
2120 {
2121 bool has_trailing_slash = false;
2122
2123 /* Find the offset of the last slash */
2124 const char* last_slash = strrchr(path, OS_PATH_SEPARATOR);
2125
2126 if (!last_slash) {
2127 /* No slash in the path, return NULL */
2128 return(NULL);
2129 }
2130
2131 /* Ok, there is a slash. Is there anything after it? */
2132 if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
2133 has_trailing_slash = true;
2134 }
2135
2136 /* Reduce repetative slashes. */
2137 while (last_slash > path
2138 && last_slash[-1] == OS_PATH_SEPARATOR) {
2139 last_slash--;
2140 }
2141
2142 /* Check for the root of a drive. */
2143 if (os_file_is_root(path, last_slash)) {
2144 return(NULL);
2145 }
2146
2147 /* If a trailing slash prevented the first strrchr() from trimming
2148 the last component of the path, trim that component now. */
2149 if (has_trailing_slash) {
2150 /* Back up to the previous slash. */
2151 last_slash--;
2152 while (last_slash > path
2153 && last_slash[0] != OS_PATH_SEPARATOR) {
2154 last_slash--;
2155 }
2156
2157 /* Reduce repetative slashes. */
2158 while (last_slash > path
2159 && last_slash[-1] == OS_PATH_SEPARATOR) {
2160 last_slash--;
2161 }
2162 }
2163
2164 /* Check for the root of a drive. */
2165 if (os_file_is_root(path, last_slash)) {
2166 return(NULL);
2167 }
2168
2169 if (last_slash - path < 0) {
2170 /* Sanity check, it prevents gcc from trying to handle this case which
2171 * results in warnings for some optimized builds */
2172 return (NULL);
2173 }
2174
2175 /* Non-trivial directory component */
2176
2177 return(mem_strdupl(path, last_slash - path));
2178 }
2179 #ifdef UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR
2180
2181 /* Test the function os_file_get_parent_dir. */
2182 void
test_os_file_get_parent_dir(const char * child_dir,const char * expected_dir)2183 test_os_file_get_parent_dir(
2184 const char* child_dir,
2185 const char* expected_dir)
2186 {
2187 char* child = mem_strdup(child_dir);
2188 char* expected = expected_dir == NULL ? NULL
2189 : mem_strdup(expected_dir);
2190
2191 /* os_file_get_parent_dir() assumes that separators are
2192 converted to OS_PATH_SEPARATOR. */
2193 os_normalize_path(child);
2194 os_normalize_path(expected);
2195
2196 char* parent = os_file_get_parent_dir(child);
2197
2198 bool unexpected = (expected == NULL
2199 ? (parent != NULL)
2200 : (0 != strcmp(parent, expected)));
2201 if (unexpected) {
2202 ib::fatal() << "os_file_get_parent_dir('" << child
2203 << "') returned '" << parent
2204 << "', instead of '" << expected << "'.";
2205 }
2206 ut_free(parent);
2207 ut_free(child);
2208 ut_free(expected);
2209 }
2210
2211 /* Test the function os_file_get_parent_dir. */
2212 void
unit_test_os_file_get_parent_dir()2213 unit_test_os_file_get_parent_dir()
2214 {
2215 test_os_file_get_parent_dir("/usr/lib/a", "/usr/lib");
2216 test_os_file_get_parent_dir("/usr/", NULL);
2217 test_os_file_get_parent_dir("//usr//", NULL);
2218 test_os_file_get_parent_dir("usr", NULL);
2219 test_os_file_get_parent_dir("usr//", NULL);
2220 test_os_file_get_parent_dir("/", NULL);
2221 test_os_file_get_parent_dir("//", NULL);
2222 test_os_file_get_parent_dir(".", NULL);
2223 test_os_file_get_parent_dir("..", NULL);
2224 # ifdef _WIN32
2225 test_os_file_get_parent_dir("D:", NULL);
2226 test_os_file_get_parent_dir("D:/", NULL);
2227 test_os_file_get_parent_dir("D:\\", NULL);
2228 test_os_file_get_parent_dir("D:/data", NULL);
2229 test_os_file_get_parent_dir("D:/data/", NULL);
2230 test_os_file_get_parent_dir("D:\\data\\", NULL);
2231 test_os_file_get_parent_dir("D:///data/////", NULL);
2232 test_os_file_get_parent_dir("D:\\\\\\data\\\\\\\\", NULL);
2233 test_os_file_get_parent_dir("D:/data//a", "D:/data");
2234 test_os_file_get_parent_dir("D:\\data\\\\a", "D:\\data");
2235 test_os_file_get_parent_dir("D:///data//a///b/", "D:///data//a");
2236 test_os_file_get_parent_dir("D:\\\\\\data\\\\a\\\\\\b\\", "D:\\\\\\data\\\\a");
2237 #endif /* _WIN32 */
2238 }
2239 #endif /* UNIV_ENABLE_UNIT_TEST_GET_PARENT_DIR */
2240
2241
2242 /** Creates all missing subdirectories along the given path.
2243 @param[in] path Path name
2244 @return DB_SUCCESS if OK, otherwise error code. */
2245 dberr_t
os_file_create_subdirs_if_needed(const char * path)2246 os_file_create_subdirs_if_needed(
2247 const char* path)
2248 {
2249 if (srv_read_only_mode) {
2250
2251 ib::error()
2252 << "read only mode set. Can't create "
2253 << "subdirectories '" << path << "'";
2254
2255 return(DB_READ_ONLY);
2256
2257 }
2258
2259 char* subdir = os_file_get_parent_dir(path);
2260
2261 if (subdir == NULL) {
2262 /* subdir is root or cwd, nothing to do */
2263 return(DB_SUCCESS);
2264 }
2265
2266 /* Test if subdir exists */
2267 os_file_type_t type;
2268 bool subdir_exists;
2269 bool success = os_file_status(subdir, &subdir_exists, &type);
2270
2271 if (success && !subdir_exists) {
2272
2273 /* Subdir does not exist, create it */
2274 dberr_t err = os_file_create_subdirs_if_needed(subdir);
2275
2276 if (err != DB_SUCCESS) {
2277
2278 ut_free(subdir);
2279
2280 return(err);
2281 }
2282
2283 success = os_file_create_directory(subdir, false);
2284 }
2285
2286 ut_free(subdir);
2287
2288 return(success ? DB_SUCCESS : DB_ERROR);
2289 }
2290
2291 /** Allocate the buffer for IO on a transparently compressed table.
2292 @param[in] type IO flags
2293 @param[out] buf buffer to read or write
2294 @param[in,out] n number of bytes to read/write, starting from
2295 offset
2296 @return pointer to allocated page, compressed data is written to the offset
2297 that is aligned on the disk sector size */
2298 static
2299 Block*
os_file_compress_page(IORequest & type,void * & buf,ulint * n)2300 os_file_compress_page(
2301 IORequest& type,
2302 void*& buf,
2303 ulint* n)
2304 {
2305 ut_ad(!type.is_log());
2306 ut_ad(type.is_write());
2307 ut_ad(type.is_compressed());
2308
2309 ulint n_alloc = *n * 2;
2310
2311 ut_a(n_alloc <= UNIV_PAGE_SIZE_MAX * 2);
2312 ut_a(type.compression_algorithm().m_type != Compression::LZ4
2313 || static_cast<ulint>(LZ4_COMPRESSBOUND(*n)) < n_alloc);
2314
2315 Block* block = os_alloc_block();
2316
2317 ulint old_compressed_len;
2318 ulint compressed_len = *n;
2319
2320 old_compressed_len = mach_read_from_2(
2321 reinterpret_cast<byte*>(buf)
2322 + FIL_PAGE_COMPRESS_SIZE_V1);
2323
2324 if (old_compressed_len > 0) {
2325 old_compressed_len = ut_calc_align(
2326 old_compressed_len + FIL_PAGE_DATA,
2327 type.block_size());
2328 } else {
2329 old_compressed_len = *n;
2330 }
2331
2332 byte* compressed_page;
2333
2334 compressed_page = static_cast<byte*>(
2335 ut_align(block->m_ptr, os_io_ptr_align));
2336
2337 byte* buf_ptr;
2338
2339 buf_ptr = os_file_compress_page(
2340 type.compression_algorithm(),
2341 type.block_size(),
2342 reinterpret_cast<byte*>(buf),
2343 *n,
2344 compressed_page,
2345 &compressed_len,
2346 type.encryption_algorithm().m_type == Encryption::KEYRING &&
2347 type.encryption_algorithm().m_key != NULL);
2348
2349 if (buf_ptr != buf) {
2350 /* Set new compressed size to uncompressed page. */
2351 memcpy(reinterpret_cast<byte*>(buf) + FIL_PAGE_COMPRESS_SIZE_V1,
2352 buf_ptr + FIL_PAGE_COMPRESS_SIZE_V1, 2);
2353
2354 buf = buf_ptr;
2355 *n = compressed_len;
2356
2357 if (compressed_len >= old_compressed_len) {
2358
2359 ut_ad(old_compressed_len <= UNIV_PAGE_SIZE);
2360
2361 type.clear_punch_hole();
2362 }
2363 }
2364
2365 return(block);
2366 }
2367
2368 /** Encrypt a page content when write it to disk.
2369 @param[in] type IO flags
2370 @param[out] buf buffer to read or write
2371 @param[in,out] n number of bytes to read/write, starting from
2372 offset
2373 @return pointer to the encrypted page */
2374 static
2375 Block*
os_file_encrypt_page(const IORequest & type,void * & buf,ulint * n)2376 os_file_encrypt_page(
2377 const IORequest& type,
2378 void*& buf,
2379 ulint* n)
2380 {
2381
2382 byte* encrypted_page;
2383 ulint encrypted_len = *n;
2384 byte* buf_ptr;
2385 Encryption encryption(type.encryption_algorithm());
2386
2387 ut_ad(type.is_write());
2388 ut_ad(type.is_encrypted());
2389
2390 Block* block = os_alloc_block();
2391
2392 encrypted_page = static_cast<byte*>(
2393 ut_align(block->m_ptr, os_io_ptr_align));
2394
2395 buf_ptr = encryption.encrypt(type,
2396 reinterpret_cast<byte*>(buf), *n,
2397 encrypted_page, &encrypted_len);
2398
2399 bool encrypted = buf_ptr != buf;
2400
2401 if (encrypted) {
2402
2403 buf = buf_ptr;
2404 *n = encrypted_len;
2405 }
2406
2407 return(block);
2408 }
2409
2410 /** Encrypt log blocks content when write it to disk.
2411 @param[in] type IO flags
2412 @param[in,out] buf buffer to read or write
2413 @param[in,out] scratch buffer for encrypting log
2414 @param[in,out] n number of bytes to read/write, starting from
2415 offset
2416 @return pointer to the encrypted log blocks */
2417 static
2418 Block*
os_file_encrypt_log(const IORequest & type,void * & buf,byte * & scratch,ulint * n)2419 os_file_encrypt_log(
2420 const IORequest& type,
2421 void*& buf,
2422 byte*& scratch,
2423 ulint* n)
2424 {
2425
2426 byte* buf_ptr;
2427 Block* block = NULL;
2428
2429 ut_ad(type.is_write());
2430 ut_ad(type.is_encrypted());
2431 ut_ad(type.is_log());
2432 ut_ad(*n % OS_FILE_LOG_BLOCK_SIZE == 0);
2433
2434 if (*n <= BUFFER_BLOCK_SIZE - os_io_ptr_align) {
2435 block = os_alloc_block();
2436 buf_ptr = block->m_ptr;
2437 scratch = NULL;
2438 } else {
2439 buf_ptr = static_cast<byte*>(
2440 ut_malloc_nokey(*n + os_io_ptr_align));
2441 scratch = buf_ptr;
2442 }
2443
2444 byte* encrypted_log;
2445 encrypted_log = static_cast<byte*>(ut_align(buf_ptr, os_io_ptr_align));
2446
2447 ulint encrypted_len = *n;
2448 Encryption encryption(type.encryption_algorithm());
2449 encrypted_log = encryption.encrypt_log(type,
2450 reinterpret_cast<byte*>(buf),
2451 *n, encrypted_log,
2452 &encrypted_len);
2453
2454 bool encrypted = encrypted_log != buf;
2455
2456 if (encrypted) {
2457 buf = encrypted_log;
2458 *n = encrypted_len;
2459 }
2460
2461 return(block);
2462 }
2463
2464 #ifndef _WIN32
2465
2466 /** Do the read/write
2467 @param[in] request The IO context and type
2468 @return the number of bytes read/written or negative value on error */
2469 ssize_t
execute(const IORequest & request)2470 SyncFileIO::execute(const IORequest& request)
2471 {
2472 ssize_t n_bytes;
2473
2474 if (request.is_read()) {
2475 n_bytes = pread(m_fh, m_buf, m_n, m_offset);
2476 } else {
2477 ut_ad(request.is_write());
2478 n_bytes = pwrite(m_fh, m_buf, m_n, m_offset);
2479 }
2480
2481 return(n_bytes);
2482 }
2483
2484 MY_ATTRIBUTE((warn_unused_result))
2485 static std::string
os_file_find_path_for_fd(os_file_t fd)2486 os_file_find_path_for_fd(
2487 os_file_t fd)
2488 {
2489 char fdname[FN_REFLEN];
2490 snprintf(fdname, sizeof fdname, "/proc/%d/fd/%d", getpid(), fd);
2491 char filename[FN_REFLEN];
2492 const int err_filename = my_readlink(filename, fdname, MYF(0));
2493 return std::string((err_filename != -1) ? filename : "");
2494 }
2495
2496 /** Free storage space associated with a section of the file.
2497 @param[in] fh Open file handle
2498 @param[in] off Starting offset (SEEK_SET)
2499 @param[in] len Size of the hole
2500 @return DB_SUCCESS or error code */
2501 static
2502 dberr_t
os_file_punch_hole_posix(os_file_t fh,os_offset_t off,os_offset_t len)2503 os_file_punch_hole_posix(
2504 os_file_t fh,
2505 os_offset_t off,
2506 os_offset_t len)
2507 {
2508 #ifdef HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE
2509 const int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
2510
2511 int ret = fallocate(fh, mode, off, len);
2512
2513 if (ret == 0) {
2514 return(DB_SUCCESS);
2515 }
2516
2517 ut_a(ret == -1);
2518
2519 if (errno == ENOTSUP) {
2520 return(DB_IO_NO_PUNCH_HOLE);
2521 }
2522
2523 const std::string fd_path = os_file_find_path_for_fd(fh);
2524 if (!fd_path.empty()) {
2525 ib::warn()
2526 << "fallocate(" << fh << " ("
2527 << fd_path << "), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2528 << off << ", " << len << ") returned errno: "
2529 << errno;
2530 } else {
2531 ib::warn()
2532 << "fallocate(" << fh
2533 <<", FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, "
2534 << off << ", " << len << ") returned errno: "
2535 << errno;
2536 }
2537
2538 return(DB_IO_ERROR);
2539
2540 #elif defined(UNIV_SOLARIS)
2541
2542 // Use F_FREESP
2543
2544 #endif /* HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE */
2545
2546 return(DB_IO_NO_PUNCH_HOLE);
2547 }
2548
2549 #if defined(LINUX_NATIVE_AIO)
2550
2551 /** Linux native AIO handler */
2552 class LinuxAIOHandler {
2553 public:
2554 /**
2555 @param[in] global_segment The global segment*/
LinuxAIOHandler(ulint global_segment)2556 LinuxAIOHandler(ulint global_segment)
2557 :
2558 m_global_segment(global_segment)
2559 {
2560 /* Should never be doing Sync IO here. */
2561 ut_a(m_global_segment != ULINT_UNDEFINED);
2562
2563 /* Find the array and the local segment. */
2564
2565 m_segment = AIO::get_array_and_local_segment(
2566 &m_array, m_global_segment);
2567
2568 m_n_slots = m_array->slots_per_segment();
2569 }
2570
2571 /** Destructor */
~LinuxAIOHandler()2572 ~LinuxAIOHandler()
2573 {
2574 // No op
2575 }
2576
2577 /**
2578 Process a Linux AIO request
2579 @param[out] m1 the messages passed with the
2580 @param[out] m2 AIO request; note that in case the
2581 AIO operation failed, these output
2582 parameters are valid and can be used to
2583 restart the operation.
2584 @param[out] request IO context
2585 @return DB_SUCCESS or error code */
2586 dberr_t poll(fil_node_t** m1, void** m2, IORequest* request);
2587
2588 private:
2589 /** Resubmit an IO request that was only partially successful
2590 @param[in,out] slot Request to resubmit
2591 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2592 dberr_t resubmit(Slot* slot);
2593
2594 /** Check if the AIO succeeded
2595 @param[in,out] slot The slot to check
2596 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2597 DB_IO_ERROR on all other errors */
2598 dberr_t check_state(Slot* slot);
2599
2600 /** @return true if a shutdown was detected */
is_shutdown() const2601 bool is_shutdown() const
2602 {
2603 return(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
2604 && !buf_page_cleaner_is_active);
2605 }
2606
2607 /** If no slot was found then the m_array->m_mutex will be released.
2608 @param[out] n_pending The number of pending IOs
2609 @return NULL or a slot that has completed IO */
2610 Slot* find_completed_slot(ulint* n_pending);
2611
2612 /** This is called from within the IO-thread. If there are no completed
2613 IO requests in the slot array, the thread calls this function to
2614 collect more requests from the Linux kernel.
2615 The IO-thread waits on io_getevents(), which is a blocking call, with
2616 a timeout value. Unless the system is very heavy loaded, keeping the
2617 IO-thread very busy, the io-thread will spend most of its time waiting
2618 in this function.
2619 The IO-thread also exits in this function. It checks server status at
2620 each wakeup and that is why we use timed wait in io_getevents(). */
2621 void collect();
2622
2623 private:
2624 /** Slot array */
2625 AIO* m_array;
2626
2627 /** Number of slots inthe local segment */
2628 ulint m_n_slots;
2629
2630 /** The local segment to check */
2631 ulint m_segment;
2632
2633 /** The global segment */
2634 ulint m_global_segment;
2635 };
2636
2637 /** Resubmit an IO request that was only partially successful
2638 @param[in,out] slot Request to resubmit
2639 @return DB_SUCCESS or DB_FAIL if the IO resubmit request failed */
2640 dberr_t
resubmit(Slot * slot)2641 LinuxAIOHandler::resubmit(Slot* slot)
2642 {
2643 #ifdef UNIV_DEBUG
2644 /* Bytes already read/written out */
2645 ulint n_bytes = slot->ptr - slot->buf;
2646
2647 ut_ad(m_array->is_mutex_owned());
2648
2649 ut_ad(n_bytes < slot->original_len);
2650 ut_ad(static_cast<ulint>(slot->n_bytes) < slot->original_len - n_bytes);
2651 /* Partial read or write scenario */
2652 ut_ad(slot->len >= static_cast<ulint>(slot->n_bytes));
2653 #endif /* UNIV_DEBUG */
2654
2655 slot->len -= slot->n_bytes;
2656 slot->ptr += slot->n_bytes;
2657 slot->offset += slot->n_bytes;
2658
2659 /* Resetting the bytes read/written */
2660 slot->n_bytes = 0;
2661 slot->io_already_done = false;
2662
2663 /* make sure that slot->offset fits in off_t */
2664 ut_ad(sizeof(off_t) >= sizeof(os_offset_t));
2665
2666 struct iocb* iocb = &slot->control;
2667 if (slot->type.is_read()) {
2668 io_prep_pread(
2669 iocb,
2670 slot->file.m_file,
2671 slot->ptr,
2672 slot->len,
2673 slot->offset);
2674
2675 } else {
2676
2677 ut_a(slot->type.is_write());
2678
2679 io_prep_pwrite(
2680 iocb,
2681 slot->file.m_file,
2682 slot->ptr,
2683 slot->len,
2684 slot->offset);
2685 }
2686
2687 iocb->data = slot;
2688
2689 /* Resubmit an I/O request */
2690 int ret = io_submit(m_array->io_ctx(m_segment), 1, &iocb);
2691
2692 if (ret < -1) {
2693 errno = -ret;
2694 }
2695
2696 return(ret < 0 ? DB_IO_PARTIAL_FAILED : DB_SUCCESS);
2697 }
2698
2699 /** Check if the AIO succeeded
2700 @param[in,out] slot The slot to check
2701 @return DB_SUCCESS, DB_FAIL if the operation should be retried or
2702 DB_IO_ERROR on all other errors */
2703 dberr_t
check_state(Slot * slot)2704 LinuxAIOHandler::check_state(Slot* slot)
2705 {
2706 ut_ad(m_array->is_mutex_owned());
2707
2708 /* Note that it may be that there is more then one completed
2709 IO requests. We process them one at a time. We may have a case
2710 here to improve the performance slightly by dealing with all
2711 requests in one sweep. */
2712
2713 srv_set_io_thread_op_info(
2714 m_global_segment, "processing completed aio requests");
2715
2716 ut_ad(slot->io_already_done);
2717
2718 dberr_t err;
2719
2720 if (slot->ret == 0) {
2721
2722 err = AIOHandler::post_io_processing(slot);
2723
2724 } else {
2725 errno = -slot->ret;
2726
2727 /* os_file_handle_error does tell us if we should retry
2728 this IO. As it stands now, we don't do this retry when
2729 reaping requests from a different context than
2730 the dispatcher. This non-retry logic is the same for
2731 Windows and Linux native AIO.
2732 We should probably look into this to transparently
2733 re-submit the IO. */
2734 os_file_handle_error(slot->name, "Linux aio");
2735
2736 err = DB_IO_ERROR;
2737 }
2738
2739 return(err);
2740 }
2741
2742 /** If no slot was found then the m_array->m_mutex will be released.
2743 @param[out] n_pending The number of pending IOs
2744 @return NULL or a slot that has completed IO */
2745 Slot*
find_completed_slot(ulint * n_pending)2746 LinuxAIOHandler::find_completed_slot(ulint* n_pending)
2747 {
2748 ulint offset = m_n_slots * m_segment;
2749
2750 *n_pending = 0;
2751
2752 m_array->acquire();
2753
2754 Slot* slot = m_array->at(offset);
2755
2756 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
2757
2758 if (slot->is_reserved) {
2759
2760 ++*n_pending;
2761
2762 if (slot->io_already_done) {
2763
2764 /* Something for us to work on.
2765 Note: We don't release the mutex. */
2766 return(slot);
2767 }
2768 }
2769 }
2770
2771 m_array->release();
2772
2773 return(NULL);
2774 }
2775
/** This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavy loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents().

Every reaped event is recorded in its slot (io_already_done, n_bytes,
ret and err); acting on those results is left to the caller. The loop
ends once at least one event was reaped, or when the shutdown state is
SRV_SHUTDOWN_EXIT_THREADS, or when the page cleaner is not active. */
void
LinuxAIOHandler::collect()
{
	ut_ad(m_n_slots > 0);
	ut_ad(m_array != NULL);
	ut_ad(m_segment < m_array->get_n_segments());

	/* Which io_context we are going to use. */
	io_context*	io_ctx = m_array->io_ctx(m_segment);

	/* Starting point of the m_segment we will be working on. */
	ulint	start_pos = m_segment * m_n_slots;

	/* End point. */
	ulint	end_pos = start_pos + m_n_slots;

	for (;;) {
		struct io_event*	events;

		/* Which part of event array we are going to work on. */
		events = m_array->io_events(m_segment * m_n_slots);

		/* Initialize the events. */
		memset(events, 0, sizeof(*events) * m_n_slots);

		/* The timeout value is arbitrary. We probably need
		to experiment with it a little. */
		struct timespec		timeout;

		timeout.tv_sec = 0;
		timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;

		int	ret;

		/* Wait for at least one and at most m_n_slots events. */
		ret = io_getevents(io_ctx, 1, m_n_slots, events, &timeout);

		/* A non-positive ret (timeout or error) skips this loop. */
		for (int i = 0; i < ret; ++i) {

			struct iocb*	iocb;

			iocb = reinterpret_cast<struct iocb*>(events[i].obj);
			ut_a(iocb != NULL);

			/* The slot pointer was stored in iocb->data when
			the request was prepared. */
			Slot*	slot = reinterpret_cast<Slot*>(iocb->data);

			/* Some sanity checks. */
			ut_a(slot != NULL);
			ut_a(slot->is_reserved);

			/* We are not scribbling previous segment. */
			ut_a(slot->pos >= start_pos);

			/* We have not overstepped to next segment. */
			ut_a(slot->pos < end_pos);

			/* We never compress/decompress the first page */

			/* Only compressed, punch-hole enabled page writes
			go through AIOHandler::io_complete() here; this
			happens before the array mutex is taken. */
			if (slot->offset > 0
			    && !slot->skip_punch_hole
			    && slot->type.is_compression_enabled()
			    && !slot->type.is_log()
			    && slot->type.is_write()
			    && slot->type.is_compressed()
			    && slot->type.punch_hole()) {

				slot->err = AIOHandler::io_complete(slot);
			} else {
				slot->err = DB_SUCCESS;
			}

			/* Mark this request as completed. The error handling
			will be done in the calling function. */
			m_array->acquire();

			/* events[i].res2 should always be ZERO */
			ut_ad(events[i].res2 == 0);
			slot->io_already_done = true;

			/*Even though events[i].res is an unsigned number
			in libaio, it is used to return a negative value
			(negated errno value) to indicate error and a positive
			value to indicate number of bytes read or written. */

			/* A result larger than the requested length is
			therefore taken to be a negated errno. */
			if (events[i].res > slot->len) {
				/* failure */
				slot->n_bytes = 0;
				slot->ret = events[i].res;
			} else {
				/* success */
				slot->n_bytes = events[i].res;
				slot->ret = 0;
			}
			m_array->release();
		}

		/* Stop once something was reaped, or when the server
		state says the io-thread should stop waiting. */
		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
		    || !buf_page_cleaner_is_active
		    || ret > 0) {

			break;
		}

		/* This error handling is for any error in collecting the
		IO requests. The errors, if any, for any particular IO
		request are simply passed on to the calling routine. */

		/* All three cases below fall through to the same
		"continue": wait again. */
		switch (ret) {
		case -EAGAIN:
			/* Not enough resources! Try again. */

		case -EINTR:
			/* Interrupted! The behaviour in case of an interrupt.
			If we have some completed IOs available then the
			return code will be the number of IOs. We get EINTR
			only if there are no completed IOs and we have been
			interrupted. */

		case 0:
			/* No pending request! Go back and check again. */

			continue;
		}

		/* All other errors should cause a trap for now. */
		ib::fatal()
			<< "Unexpected ret_code[" << ret
			<< "] from io_getevents()!";

		break;
	}
}
2917
2918 /** Process a Linux AIO request
2919 @param[out] m1 the messages passed with the
2920 @param[out] m2 AIO request; note that in case the
2921 AIO operation failed, these output
2922 parameters are valid and can be used to
2923 restart the operation.
2924 @param[out] request IO context
2925 @return DB_SUCCESS or error code */
2926 dberr_t
poll(fil_node_t ** m1,void ** m2,IORequest * request)2927 LinuxAIOHandler::poll(fil_node_t** m1, void** m2, IORequest* request)
2928 {
2929 dberr_t err;
2930 Slot* slot;
2931
2932 /* Loop until we have found a completed request. */
2933 for (;;) {
2934
2935 ulint n_pending;
2936
2937 slot = find_completed_slot(&n_pending);
2938
2939 if (slot != NULL) {
2940
2941 ut_ad(m_array->is_mutex_owned());
2942
2943 err = check_state(slot);
2944
2945 /* DB_FAIL is not a hard error, we should retry */
2946 if (err != DB_FAIL) {
2947 break;
2948 }
2949
2950 /* Partial IO, resubmit request for
2951 remaining bytes to read/write */
2952 err = resubmit(slot);
2953
2954 if (err != DB_SUCCESS) {
2955 break;
2956 }
2957
2958 m_array->release();
2959
2960 } else if (is_shutdown() && n_pending == 0) {
2961
2962 /* There is no completed request. If there is
2963 no pending request at all, and the system is
2964 being shut down, exit. */
2965
2966 *m1 = NULL;
2967 *m2 = NULL;
2968
2969 return(DB_SUCCESS);
2970
2971 } else {
2972
2973 /* Wait for some request. Note that we return
2974 from wait if we have found a request. */
2975
2976 srv_set_io_thread_op_info(
2977 m_global_segment,
2978 "waiting for completed aio requests");
2979
2980 collect();
2981 }
2982 }
2983
2984 if (err == DB_IO_PARTIAL_FAILED) {
2985 /* Aborting in case of submit failure */
2986 ib::fatal()
2987 << "Native Linux AIO interface. "
2988 "io_submit() call failed when "
2989 "resubmitting a partial I/O "
2990 "request on the file " << slot->name
2991 << ".";
2992 }
2993
2994 *m1 = slot->m1;
2995 *m2 = slot->m2;
2996
2997 *request = slot->type;
2998
2999 m_array->release(slot);
3000
3001 m_array->release();
3002
3003 return(err);
3004 }
3005
3006 /** This function is only used in Linux native asynchronous i/o.
3007 Waits for an aio operation to complete. This function is used to wait for
3008 the completed requests. The aio array of pending requests is divided
3009 into segments. The thread specifies which segment or slot it wants to wait
3010 for. NOTE: this function will also take care of freeing the aio slot,
3011 therefore no other thread is allowed to do the freeing!
3012
3013 @param[in] global_seg segment number in the aio array
3014 to wait for; segment 0 is the ibuf
3015 i/o thread, segment 1 is log i/o thread,
3016 then follow the non-ibuf read threads,
3017 and the last are the non-ibuf write
3018 threads.
3019 @param[out] m1 the messages passed with the
3020 @param[out] m2 AIO request; note that in case the
3021 AIO operation failed, these output
3022 parameters are valid and can be used to
3023 restart the operation.
3024 @param[out]xi request IO context
3025 @return DB_SUCCESS if the IO was successful */
3026 static
3027 dberr_t
os_aio_linux_handler(ulint global_segment,fil_node_t ** m1,void ** m2,IORequest * request)3028 os_aio_linux_handler(
3029 ulint global_segment,
3030 fil_node_t** m1,
3031 void** m2,
3032 IORequest* request)
3033 {
3034 LinuxAIOHandler handler(global_segment);
3035
3036 dberr_t err = handler.poll(m1, m2, request);
3037
3038 if (err == DB_IO_NO_PUNCH_HOLE) {
3039 fil_no_punch_hole(*m1);
3040 err = DB_SUCCESS;
3041 }
3042
3043 return(err);
3044 }
3045 #endif
3046
3047 /** Submit buffered AIO requests on the given segment to the kernel.
3048 (low level function).
3049 @param[in] acquire_mutex specifies whether to lock array mutex */
3050 void
os_aio_dispatch_read_array_submit_low(bool acquire_mutex MY_ATTRIBUTE ((unused)))3051 AIO::os_aio_dispatch_read_array_submit_low(
3052 bool acquire_mutex MY_ATTRIBUTE((unused)))
3053 {
3054 os_aio_dispatch_read_array_submit_low_for_array(acquire_mutex, s_reads);
3055 if (s_ibuf != NULL) {
3056 os_aio_dispatch_read_array_submit_low_for_array(acquire_mutex, s_ibuf);
3057 }
3058 }
3059
3060 /** Submit buffered AIO requests on the array to the kernel.
3061 (low level function).
3062 @param[in] acquire_mutex specifies whether to lock array mutex
3063 @param[in] array for which to submit IO */
3064 void
os_aio_dispatch_read_array_submit_low_for_array(bool acquire_mutex MY_ATTRIBUTE ((unused)),const AIO * arr)3065 AIO::os_aio_dispatch_read_array_submit_low_for_array(
3066 bool acquire_mutex MY_ATTRIBUTE((unused)), const AIO* arr)
3067 {
3068 if (!srv_use_native_aio) {
3069 return;
3070 }
3071 #if defined(LINUX_NATIVE_AIO)
3072 const AIO* array = arr;
3073 ulint total_submitted = 0;
3074 if (acquire_mutex)
3075 array->acquire();
3076 /* Submit aio requests buffered on all segments. */
3077 ut_ad(array->m_pending);
3078 ut_ad(array->m_count);
3079 for (ulint i = 0; i < array->m_n_segments; i++) {
3080 const int count = array->m_count[i];
3081 int offset = 0;
3082 while (offset != count) {
3083 struct iocb** const iocb_array = array->m_pending
3084 + i * array->m_slots.size()
3085 / array->m_n_segments
3086 + offset;
3087 const int partial_count = count - offset;
3088 /* io_submit() returns number of successfully queued
3089 requests or (-errno).
3090 It returns 0 only if the number of iocb blocks passed
3091 is also 0. */
3092 const int submitted = io_submit(
3093 array->m_aio_ctx[i],
3094 partial_count, iocb_array);
3095
3096 /* This assertion prevents infinite loop in both
3097 debug and release modes. */
3098 ut_a(submitted != 0);
3099
3100 if (submitted < 0) {
3101 /* Terminating with fatal error */
3102 const char* errmsg =
3103 strerror(-submitted);
3104 ib::fatal() << "Trying to sumbit " << count
3105 << " aio requests, io_submit() set "
3106 << "errno to " << -submitted << ": "
3107 << (errmsg ? errmsg : "<unknown>");
3108 }
3109 ut_ad(submitted <= partial_count);
3110 if (submitted < partial_count)
3111 {
3112 ib::warn() << "Trying to sumbit " << count
3113 << " aio requests, io_submit() "
3114 << "submitted only " << submitted;
3115 }
3116 offset += submitted;
3117 }
3118 total_submitted += count;
3119 }
3120 /* Reset the aio request buffer. */
3121 memset(array->m_pending, 0x0,
3122 sizeof(struct iocb*) * array->m_slots.size());
3123 memset(array->m_count, 0x0, sizeof(ulint) * array->m_n_segments);
3124 if (acquire_mutex)
3125 array->release();
3126
3127 srv_stats.n_aio_submitted.add(total_submitted);
3128 #endif
3129 }
3130
3131 /** Submit buffered AIO requests on the given segment to the kernel. */
3132 void
os_aio_dispatch_read_array_submit()3133 os_aio_dispatch_read_array_submit()
3134 {
3135 AIO::os_aio_dispatch_read_array_submit_low(true);
3136 }
3137
3138 #if defined(LINUX_NATIVE_AIO)
3139 /** Dispatch an AIO request to the kernel.
3140 @param[in,out] slot an already reserved slot
3141 @param[in] should_buffer should buffer the request
3142 rather than submit
3143 @return true on success. */
3144 bool
linux_dispatch(Slot * slot,bool should_buffer)3145 AIO::linux_dispatch(Slot* slot, bool should_buffer)
3146 {
3147 ut_ad(slot);
3148 ut_a(slot->is_reserved);
3149 ut_ad(slot->type.validate());
3150
3151 /* Find out what we are going to work with.
3152 The iocb struct is directly in the slot.
3153 The io_context is one per segment. */
3154
3155 struct iocb* iocb = &slot->control;
3156
3157 ulint slots_per_segment = m_slots.size() / m_n_segments;
3158 ulint io_ctx_index = slot->pos / slots_per_segment;
3159
3160 if (should_buffer) {
3161 ut_ad(this == s_reads || this == s_ibuf);
3162
3163 acquire();
3164 /* There are m_slots.size() elements in m_pending,
3165 which is divided into m_n_segments area of equal size.
3166 The iocb of each segment are buffered in its corresponding area
3167 in the pending array consecutively as they come.
3168 m_count[i] records the number of buffered aio requests
3169 in the ith segment.*/
3170 ut_ad(m_count);
3171 ulint& count = m_count[io_ctx_index];
3172 ut_ad(count != slots_per_segment);
3173 ulint n = io_ctx_index * slots_per_segment + count;
3174 ut_ad(m_pending);
3175 m_pending[n] = iocb;
3176 ++count;
3177 if (count == slots_per_segment) {
3178 AIO::os_aio_dispatch_read_array_submit_low_for_array(false, this);
3179 }
3180 release();
3181 return(true);
3182 }
3183 /* Submit the given request. */
3184 int ret = io_submit(m_aio_ctx[io_ctx_index], 1, &iocb);
3185
3186 /* io_submit() returns number of successfully queued requests
3187 or -errno. */
3188
3189 if (ret != 1) {
3190 errno = -ret;
3191 }
3192
3193 return(ret == 1);
3194 }
3195
3196 /** Creates an io_context for native linux AIO.
3197 @param[in] max_events number of events
3198 @param[out] io_ctx io_ctx to initialize.
3199 @return true on success. */
3200 bool
linux_create_io_ctx(ulint max_events,io_context_t * io_ctx)3201 AIO::linux_create_io_ctx(
3202 ulint max_events,
3203 io_context_t* io_ctx)
3204 {
3205 ssize_t n_retries = 0;
3206
3207 for (;;) {
3208
3209 memset(io_ctx, 0x0, sizeof(*io_ctx));
3210
3211 /* Initialize the io_ctx. Tell it how many pending
3212 IO requests this context will handle. */
3213
3214 int ret = io_setup(max_events, io_ctx);
3215
3216 if (ret == 0) {
3217 /* Success. Return now. */
3218 return(true);
3219 }
3220
3221 /* If we hit EAGAIN we'll make a few attempts before failing. */
3222
3223 switch (ret) {
3224 case -EAGAIN:
3225 if (n_retries == 0) {
3226 /* First time around. */
3227 ib::warn()
3228 << "io_setup() failed with EAGAIN."
3229 " Will make "
3230 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
3231 << " attempts before giving up.";
3232 }
3233
3234 if (n_retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
3235
3236 ++n_retries;
3237
3238 ib::warn()
3239 << "io_setup() attempt "
3240 << n_retries << ".";
3241
3242 os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
3243
3244 continue;
3245 }
3246
3247 /* Have tried enough. Better call it a day. */
3248 ib::error()
3249 << "io_setup() failed with EAGAIN after "
3250 << OS_AIO_IO_SETUP_RETRY_ATTEMPTS
3251 << " attempts.";
3252 break;
3253
3254 case -ENOSYS:
3255 ib::error()
3256 << "Linux Native AIO interface"
3257 " is not supported on this platform. Please"
3258 " check your OS documentation and install"
3259 " appropriate binary of InnoDB.";
3260
3261 break;
3262
3263 default:
3264 ib::error()
3265 << "Linux Native AIO setup"
3266 << " returned following error["
3267 << ret << "]";
3268 break;
3269 }
3270
3271 ib::info()
3272 << "You can disable Linux Native AIO by"
3273 " setting innodb_use_native_aio = 0 in my.cnf";
3274
3275 break;
3276 }
3277
3278 return(false);
3279 }
3280
3281 /** Checks if the system supports native linux aio. On some kernel
3282 versions where native aio is supported it won't work on tmpfs. In such
3283 cases we can't use native aio as it is not possible to mix simulated
3284 and native aio.
3285 @return: true if supported, false otherwise. */
3286 bool
is_linux_native_aio_supported()3287 AIO::is_linux_native_aio_supported()
3288 {
3289 int fd;
3290 io_context_t io_ctx;
3291 char name[1000];
3292
3293 if (!linux_create_io_ctx(1, &io_ctx)) {
3294
3295 /* The platform does not support native aio. */
3296
3297 return(false);
3298
3299 } else if (!srv_read_only_mode) {
3300
3301 /* Now check if tmpdir supports native aio ops. */
3302 fd = innobase_mysql_tmpfile(NULL);
3303
3304 if (fd < 0) {
3305 ib::warn()
3306 << "Unable to create temp file to check"
3307 " native AIO support.";
3308
3309 return(false);
3310 }
3311 } else {
3312
3313 os_normalize_path(srv_log_group_home_dir);
3314
3315 ulint dirnamelen = strlen(srv_log_group_home_dir);
3316
3317 ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
3318
3319 memcpy(name, srv_log_group_home_dir, dirnamelen);
3320
3321 /* Add a path separator if needed. */
3322 if (dirnamelen && name[dirnamelen - 1] != OS_PATH_SEPARATOR) {
3323
3324 name[dirnamelen++] = OS_PATH_SEPARATOR;
3325 }
3326
3327 strcpy(name + dirnamelen, "ib_logfile0");
3328
3329 fd = ::open(name, O_RDONLY);
3330
3331 if (fd == -1) {
3332
3333 ib::warn()
3334 << "Unable to open"
3335 << " \"" << name << "\" to check native"
3336 << " AIO read support.";
3337
3338 return(false);
3339 }
3340 }
3341
3342 struct io_event io_event;
3343
3344 memset(&io_event, 0x0, sizeof(io_event));
3345
3346 byte* buf = static_cast<byte*>(ut_malloc_nokey(UNIV_PAGE_SIZE * 2));
3347 byte* ptr = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
3348
3349 struct iocb iocb;
3350
3351 /* Suppress valgrind warning. */
3352 memset(buf, 0x00, UNIV_PAGE_SIZE * 2);
3353 memset(&iocb, 0x0, sizeof(iocb));
3354
3355 struct iocb* p_iocb = &iocb;
3356
3357 if (!srv_read_only_mode) {
3358
3359 io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0);
3360
3361 } else {
3362 ut_a(UNIV_PAGE_SIZE >= 512);
3363 io_prep_pread(p_iocb, fd, ptr, 512, 0);
3364 }
3365
3366 int err = io_submit(io_ctx, 1, &p_iocb);
3367
3368 if (err >= 1) {
3369 /* Now collect the submitted IO request. */
3370 err = io_getevents(io_ctx, 1, 1, &io_event, NULL);
3371 }
3372
3373 ut_free(buf);
3374 close(fd);
3375
3376 switch (err) {
3377 case 1:
3378 return(true);
3379
3380 case -EINVAL:
3381 case -ENOSYS:
3382 ib::error()
3383 << "Linux Native AIO not supported. You can either"
3384 " move "
3385 << (srv_read_only_mode ? name : "tmpdir")
3386 << " to a file system that supports native"
3387 " AIO or you can set innodb_use_native_aio to"
3388 " FALSE to avoid this message.";
3389
3390 /* fall through. */
3391 default:
3392 ib::error()
3393 << "Linux Native AIO check on "
3394 << (srv_read_only_mode ? name : "tmpdir")
3395 << "returned error[" << -err << "]";
3396 }
3397
3398 return(false);
3399 }
3400
3401 #endif /* LINUX_NATIVE_AIO */
3402
3403 /** For an EINVAL I/O error, prints a diagnostic message if innodb_flush_method
3404 == ALL_O_DIRECT.
3405 @param[in] err C error code
3406 @return true if the diagnostic message was printed
3407 @return false if the diagnostic message does not apply */
3408 static
3409 bool
os_diagnose_all_o_direct_einval(ulint err)3410 os_diagnose_all_o_direct_einval(
3411 ulint err)
3412 {
3413 if ((err == EINVAL)
3414 && (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) {
3415 ib::info() << "The error might be caused by redo log I/O not "
3416 "satisfying innodb_flush_method=ALL_O_DIRECT "
3417 "requirements by the underlying file system.";
3418 if (srv_log_write_ahead_size
3419 != DEFAULT_SRV_LOG_WRITE_AHEAD_SIZE)
3420 ib::info() <<
3421 "This might be caused by an incompatible "
3422 "non-default innodb_log_write_ahead_size "
3423 "value " << srv_log_write_ahead_size;
3424 ib::info() <<
3425 "Please file a bug at https://bugs.percona.com and "
3426 "include this error message, my.cnf settings, ";
3427 ib::info() <<
3428 "and information about the file system where the redo "
3429 "log resides.";
3430 ib::info() <<
3431 "A possible workaround is to change "
3432 "innodb_flush_method value to something else "
3433 "than ALL_O_DIRECT.";
3434 return(true);
3435 }
3436 return(false);
3437 }
3438
3439 /** Retrieves the last error number if an error occurs in a file io function.
3440 The number should be retrieved before any other OS calls (because they may
3441 overwrite the error number). If the number is not known to this program,
3442 the OS error number + 100 is returned.
3443 @param[in] report_all_errors true if we want an error message
3444 printed of all errors
3445 @param[in] on_error_silent true then don't print any diagnostic
3446 to the log
3447 @return error number, or OS error number + 100 */
3448 static
3449 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)3450 os_file_get_last_error_low(
3451 bool report_all_errors,
3452 bool on_error_silent)
3453 {
3454 int err = errno;
3455
3456 if (err == 0) {
3457 return(0);
3458 }
3459
3460 if (report_all_errors
3461 || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
3462
3463 ib::error()
3464 << "Operating system error number "
3465 << err
3466 << " in a file operation.";
3467
3468 if (err == ENOENT) {
3469
3470 ib::error()
3471 << "The error means the system"
3472 " cannot find the path specified.";
3473
3474 if (srv_is_being_started) {
3475
3476 ib::error()
3477 << "If you are installing InnoDB,"
3478 " remember that you must create"
3479 " directories yourself, InnoDB"
3480 " does not create them.";
3481 }
3482 } else if (err == EACCES) {
3483
3484 ib::error()
3485 << "The error means mysqld does not have"
3486 " the access rights to the directory.";
3487
3488 } else if (!os_diagnose_all_o_direct_einval(err)) {
3489 if (strerror(err) != NULL) {
3490
3491 ib::error()
3492 << "Error number " << err << " means '"
3493 << strerror(err) << "'";
3494 }
3495
3496 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
3497 }
3498 }
3499
3500 switch (err) {
3501 case ENOSPC:
3502 return(OS_FILE_DISK_FULL);
3503 case ENOENT:
3504 return(OS_FILE_NOT_FOUND);
3505 case EEXIST:
3506 return(OS_FILE_ALREADY_EXISTS);
3507 case EXDEV:
3508 case ENOTDIR:
3509 case EISDIR:
3510 return(OS_FILE_PATH_ERROR);
3511 case EAGAIN:
3512 if (srv_use_native_aio) {
3513 return(OS_FILE_AIO_RESOURCES_RESERVED);
3514 }
3515 break;
3516 case EINTR:
3517 return(OS_FILE_AIO_INTERRUPTED);
3518 break;
3519 case EACCES:
3520 return(OS_FILE_ACCESS_VIOLATION);
3521 }
3522 return(OS_FILE_ERROR_MAX + err);
3523 }
3524
3525 /** Wrapper to fsync(2) that retries the call on some errors.
3526 Returns the value 0 if successful; otherwise the value -1 is returned and
3527 the global variable errno is set to indicate the error.
3528 @param[in] file open file handle
3529 @return 0 if success, -1 otherwise */
3530 static
3531 int
os_file_fsync_posix(os_file_t file)3532 os_file_fsync_posix(
3533 os_file_t file)
3534 {
3535 ulint failures = 0;
3536
3537 for (;;) {
3538
3539 ++os_n_fsyncs;
3540
3541 int ret = fsync(file);
3542
3543 if (ret == 0) {
3544 return(ret);
3545 }
3546
3547 switch(errno) {
3548 case ENOLCK:
3549
3550 ++failures;
3551 ut_a(failures < 1000);
3552
3553 if (!(failures % 100)) {
3554
3555 ib::warn()
3556 << "fsync(): "
3557 << "No locks available; retrying";
3558 }
3559
3560 /* 0.2 sec */
3561 os_thread_sleep(200000);
3562 break;
3563
3564 case EIO: {
3565
3566 const std::string fd_path
3567 = os_file_find_path_for_fd(file);
3568 if (!fd_path.empty())
3569 ib::fatal() << "fsync(\"" << fd_path
3570 << "\") returned EIO, aborting.";
3571 else
3572 ib::fatal() << "fsync() returned EIO, aborting.";
3573 break;
3574 }
3575
3576 case EINTR:
3577
3578 ++failures;
3579 ut_a(failures < 2000);
3580 break;
3581
3582 default:
3583 ut_error;
3584 break;
3585 }
3586 }
3587
3588 ut_error;
3589
3590 return(-1);
3591 }
3592
3593 /** Check the existence and type of the given file.
3594 @param[in] path path name of file
3595 @param[out] exists true if the file exists
3596 @param[out] type Type of the file, if it exists
3597 @return true if call succeeded */
3598 bool
os_file_status_posix(const char * path,bool * exists,os_file_type_t * type)3599 os_file_status_posix(
3600 const char* path,
3601 bool* exists,
3602 os_file_type_t* type)
3603 {
3604 struct stat statinfo;
3605
3606 int ret = stat(path, &statinfo);
3607
3608 *exists = !ret;
3609
3610 if (!ret) {
3611 /* file exists, everything OK */
3612
3613 } else if (errno == ENOENT || errno == ENOTDIR
3614 || errno == ENAMETOOLONG) {
3615 /* file does not exist */
3616 return(true);
3617
3618 } else {
3619 /* file exists, but stat call failed */
3620 os_file_handle_error_no_exit(path, "stat", false);
3621 return(false);
3622 }
3623
3624 if (S_ISDIR(statinfo.st_mode)) {
3625 *type = OS_FILE_TYPE_DIR;
3626
3627 } else if (S_ISLNK(statinfo.st_mode)) {
3628 *type = OS_FILE_TYPE_LINK;
3629
3630 } else if (S_ISREG(statinfo.st_mode)) {
3631 *type = OS_FILE_TYPE_FILE;
3632
3633 } else {
3634 *type = OS_FILE_TYPE_UNKNOWN;
3635 }
3636
3637 return(true);
3638 }
3639
3640 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3641 function!
3642 Flushes the write buffers of a given file to the disk.
3643 @param[in] file handle to a file
3644 @return true if success */
3645 bool
os_file_flush_func(os_file_t file)3646 os_file_flush_func(
3647 os_file_t file)
3648 {
3649 int ret;
3650
3651 ret = os_file_fsync_posix(file);
3652
3653 if (ret == 0) {
3654 return(true);
3655 }
3656
3657 /* Since Linux returns EINVAL if the 'file' is actually a raw device,
3658 we choose to ignore that error if we are using raw disks */
3659
3660 if (srv_start_raw_disk_in_use && errno == EINVAL) {
3661
3662 return(true);
3663 }
3664
3665 ib::error() << "The OS said file flush did not succeed";
3666
3667 os_file_handle_error(NULL, "flush");
3668
3669 /* It is a fatal error if a file flush does not succeed, because then
3670 the database can get corrupt on disk */
3671 ut_error;
3672
3673 return(false);
3674 }
3675
3676 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
3677 this function!
3678 A simple function to open or create a file.
3679 @param[in] name name of the file or path as a null-terminated
3680 string
3681 @param[in] create_mode create mode
3682 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
3683 @param[in] read_only if true, read only checks are enforced
3684 @param[out] success true if succeed, false if error
3685 @return handle to the file, not defined if error, error number
3686 can be retrieved with os_file_get_last_error */
3687 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)3688 os_file_create_simple_func(
3689 const char* name,
3690 ulint create_mode,
3691 ulint access_type,
3692 bool read_only,
3693 bool* success)
3694 {
3695 pfs_os_file_t file;
3696
3697 *success = false;
3698
3699 int create_flag;
3700
3701 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
3702 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
3703
3704 int create_o_sync;
3705 if (create_mode & OS_FILE_O_SYNC) {
3706
3707 create_o_sync = O_SYNC;
3708 create_mode &= ~(static_cast<ulint>(OS_FILE_O_SYNC));
3709 } else {
3710 create_o_sync = 0;
3711 }
3712
3713 if (create_mode == OS_FILE_OPEN) {
3714
3715 if (access_type == OS_FILE_READ_ONLY) {
3716
3717 create_flag = O_RDONLY;
3718
3719 } else if (read_only) {
3720
3721 create_flag = O_RDONLY;
3722
3723 } else {
3724 create_flag = O_RDWR;
3725 }
3726
3727 } else if (read_only) {
3728
3729 create_flag = O_RDONLY;
3730
3731 } else if (create_mode == OS_FILE_CREATE) {
3732
3733 create_flag = O_RDWR | O_CREAT | O_EXCL;
3734
3735 } else if (create_mode == OS_FILE_CREATE_PATH) {
3736
3737 /* Create subdirs along the path if needed. */
3738
3739 *success = os_file_create_subdirs_if_needed(name);
3740
3741 if (!*success) {
3742
3743 ib::error()
3744 << "Unable to create subdirectories '"
3745 << name << "'";
3746
3747 file.m_file = OS_FILE_CLOSED;
3748 return(file);
3749 }
3750
3751 create_flag = O_RDWR | O_CREAT | O_EXCL;
3752 create_mode = OS_FILE_CREATE;
3753 } else {
3754
3755 ib::error()
3756 << "Unknown file create mode ("
3757 << create_mode
3758 << " for file '" << name << "'";
3759
3760 file.m_file = OS_FILE_CLOSED;
3761 return(file);
3762 }
3763
3764 bool retry;
3765
3766 do {
3767 file.m_file = ::open(name, create_flag | create_o_sync,
3768 os_innodb_umask);
3769
3770 if (file.m_file == -1) {
3771 *success = false;
3772
3773 retry = os_file_handle_error(
3774 name,
3775 create_mode == OS_FILE_OPEN
3776 ? "open" : "create");
3777 } else {
3778 *success = true;
3779 retry = false;
3780 }
3781
3782 } while (retry);
3783
3784 #ifdef USE_FILE_LOCK
3785 if (!read_only
3786 && *success
3787 && access_type == OS_FILE_READ_WRITE
3788 && os_file_lock(file.m_file, name)) {
3789
3790 *success = false;
3791 close(file.m_file);
3792 file.m_file = -1;
3793 }
3794 #endif /* USE_FILE_LOCK */
3795
3796 return(file);
3797 }
3798
3799 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
3800 function!
3801 Truncates a file at the specified position.
3802 @param[in] file file to truncate
3803 @param[in] new_len new file length
3804 @return true if success */
3805 bool
os_file_set_eof_at_func(os_file_t file,ib_uint64_t new_len)3806 os_file_set_eof_at_func(
3807 os_file_t file,
3808 ib_uint64_t new_len)
3809 {
3810 #ifdef __WIN__
3811 LARGE_INTEGER li, li2;
3812 li.QuadPart = new_len;
3813 return(SetFilePointerEx(file, li, &li2,FILE_BEGIN)
3814 && SetEndOfFile(file));
3815 #else
3816 /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */
3817 return(!ftruncate(file, new_len));
3818 #endif
3819 }
3820
3821 /** This function attempts to create a directory named pathname. The new
3822 directory gets default permissions. On Unix the permissions are
3823 (0770 & ~umask). If the directory exists already, nothing is done and
3824 the call succeeds, unless the fail_if_exists arguments is true.
3825 If another error occurs, such as a permission error, this does not crash,
3826 but reports the error and returns false.
3827 @param[in] pathname directory name as null-terminated string
3828 @param[in] fail_if_exists if true, pre-existing directory is treated as
3829 an error.
3830 @return true if call succeeds, false on error */
3831 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)3832 os_file_create_directory(
3833 const char* pathname,
3834 bool fail_if_exists)
3835 {
3836 int rcode = mkdir(pathname, 0770);
3837
3838 if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
3839 /* failure */
3840 os_file_handle_error_no_exit(pathname, "mkdir", false);
3841
3842 return(false);
3843 }
3844
3845 return(true);
3846 }
3847
3848 /**
3849 The os_file_opendir() function opens a directory stream corresponding to the
3850 directory named by the dirname argument. The directory stream is positioned
3851 at the first entry. In both Unix and Windows we automatically skip the '.'
3852 and '..' items at the start of the directory listing.
3853 @param[in] dirname directory name; it must not contain a trailing
3854 '\' or '/'
3855 @param[in] is_fatal true if we should treat an error as a fatal
3856 error; if we try to open symlinks then we do
3857 not wish a fatal error if it happens not to be
3858 a directory
3859 @return directory stream, NULL if error */
3860 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)3861 os_file_opendir(
3862 const char* dirname,
3863 bool error_is_fatal)
3864 {
3865 os_file_dir_t dir;
3866 dir = opendir(dirname);
3867
3868 if (dir == NULL && error_is_fatal) {
3869 os_file_handle_error(dirname, "opendir");
3870 }
3871
3872 return(dir);
3873 }
3874
3875 /** Closes a directory stream.
3876 @param[in] dir directory stream
3877 @return 0 if success, -1 if failure */
3878 int
os_file_closedir(os_file_dir_t dir)3879 os_file_closedir(
3880 os_file_dir_t dir)
3881 {
3882 int ret = closedir(dir);
3883
3884 if (ret != 0) {
3885 os_file_handle_error_no_exit(NULL, "closedir", false);
3886 }
3887
3888 return(ret);
3889 }
3890
3891 /** This function returns information of the next file in the directory. We jump
3892 over the '.' and '..' entries in the directory.
3893 @param[in] dirname directory name or path
3894 @param[in] dir directory stream
3895 @param[out] info buffer where the info is returned
3896 @return 0 if ok, -1 if error, 1 if at the end of the directory */
3897 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)3898 os_file_readdir_next_file(
3899 const char* dirname,
3900 os_file_dir_t dir,
3901 os_file_stat_t* info)
3902 {
3903 struct dirent* ent;
3904 char* full_path;
3905 int ret;
3906 struct stat statinfo;
3907
3908 #ifdef HAVE_READDIR_R
3909 char dirent_buf[sizeof(struct dirent)
3910 + _POSIX_PATH_MAX + 100];
3911 /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
3912 the max file name len; but in most standards, the
3913 length is NAME_MAX; we add 100 to be even safer */
3914 #endif /* HAVE_READDIR_R */
3915
3916 next_file:
3917
3918 #ifdef HAVE_READDIR_R
3919 ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent);
3920
3921 if (ret != 0) {
3922
3923 ib::error()
3924 << "Cannot read directory " << dirname
3925 << " error: " << ret;
3926
3927 return(-1);
3928 }
3929
3930 if (ent == NULL) {
3931 /* End of directory */
3932
3933 return(1);
3934 }
3935
3936 ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
3937 #else
3938 ent = readdir(dir);
3939
3940 if (ent == NULL) {
3941
3942 return(1);
3943 }
3944 #endif /* HAVE_READDIR_R */
3945 ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
3946
3947 if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
3948
3949 goto next_file;
3950 }
3951
3952 strcpy(info->name, ent->d_name);
3953
3954 full_path = static_cast<char*>(
3955 ut_malloc_nokey(strlen(dirname) + strlen(ent->d_name) + 10));
3956
3957 sprintf(full_path, "%s/%s", dirname, ent->d_name);
3958
3959 ret = stat(full_path, &statinfo);
3960
3961 if (ret) {
3962
3963 if (errno == ENOENT) {
3964 /* readdir() returned a file that does not exist,
3965 it must have been deleted in the meantime. Do what
3966 would have happened if the file was deleted before
3967 readdir() - ignore and go to the next entry.
3968 If this is the last entry then info->name will still
3969 contain the name of the deleted file when this
3970 function returns, but this is not an issue since the
3971 caller shouldn't be looking at info when end of
3972 directory is returned. */
3973
3974 ut_free(full_path);
3975
3976 goto next_file;
3977 }
3978
3979 os_file_handle_error_no_exit(full_path, "stat", false);
3980
3981 ut_free(full_path);
3982
3983 return(-1);
3984 }
3985
3986 info->size = statinfo.st_size;
3987
3988 if (S_ISDIR(statinfo.st_mode)) {
3989 info->type = OS_FILE_TYPE_DIR;
3990 } else if (S_ISLNK(statinfo.st_mode)) {
3991 info->type = OS_FILE_TYPE_LINK;
3992 } else if (S_ISREG(statinfo.st_mode)) {
3993 info->type = OS_FILE_TYPE_FILE;
3994 } else {
3995 info->type = OS_FILE_TYPE_UNKNOWN;
3996 }
3997
3998 ut_free(full_path);
3999
4000 return(0);
4001 }
4002
4003 /** NOTE! Use the corresponding macro os_file_create(), not directly
4004 this function!
4005 Opens an existing file or creates a new.
4006 @param[in] name name of the file or path as a null-terminated
4007 string
4008 @param[in] create_mode create mode
4009 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
4010 is desired, OS_FILE_NORMAL, if any normal file;
4011 NOTE that it also depends on type, os_aio_..
4012 and srv_.. variables whether we really use async
4013 I/O or unbuffered I/O: look in the function
4014 source code for the exact rules
4015 @param[in] type OS_DATA_FILE or OS_LOG_FILE
4016 @param[in] read_only true, if read only checks should be enforcedm
4017 @param[in] success true if succeeded
4018 @return handle to the file, not defined if error, error number
4019 can be retrieved with os_file_get_last_error */
4020 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)4021 os_file_create_func(
4022 const char* name,
4023 ulint create_mode,
4024 ulint purpose,
4025 ulint type,
4026 bool read_only,
4027 bool* success)
4028 {
4029 bool on_error_no_exit;
4030 bool on_error_silent;
4031 pfs_os_file_t file;
4032
4033 *success = false;
4034
4035 DBUG_EXECUTE_IF(
4036 "ib_create_table_fail_disk_full",
4037 *success = false;
4038 errno = ENOSPC;
4039 file.m_file = OS_FILE_CLOSED;
4040 return(file);
4041 );
4042
4043 int create_flag;
4044 const char* mode_str = NULL;
4045
4046 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
4047 ? true : false;
4048 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
4049 ? true : false;
4050
4051 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
4052 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
4053
4054 if (create_mode == OS_FILE_OPEN
4055 || create_mode == OS_FILE_OPEN_RAW
4056 || create_mode == OS_FILE_OPEN_RETRY) {
4057
4058 mode_str = "OPEN";
4059
4060 create_flag = read_only ? O_RDONLY : O_RDWR;
4061
4062 } else if (read_only) {
4063
4064 mode_str = "OPEN";
4065
4066 create_flag = O_RDONLY;
4067
4068 } else if (create_mode == OS_FILE_CREATE) {
4069
4070 mode_str = "CREATE";
4071 create_flag = O_RDWR | O_CREAT | O_EXCL;
4072
4073 } else if (create_mode == OS_FILE_OVERWRITE) {
4074
4075 mode_str = "OVERWRITE";
4076 create_flag = O_RDWR | O_CREAT | O_TRUNC;
4077
4078 } else {
4079 ib::error()
4080 << "Unknown file create mode (" << create_mode << ")"
4081 << " for file '" << name << "'";
4082
4083 file.m_file = OS_FILE_CLOSED;
4084 return(file);
4085 }
4086
4087 ut_a(type == OS_LOG_FILE
4088 || type == OS_DATA_FILE
4089 || type == OS_DATA_TEMP_FILE);
4090
4091 ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL);
4092
4093 #ifdef O_SYNC
4094 /* We let O_SYNC only affect log files; note that we map O_DSYNC to
4095 O_SYNC because the datasync options seemed to corrupt files in 2001
4096 in both Linux and Solaris */
4097
4098 if (!read_only
4099 && type == OS_LOG_FILE
4100 && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
4101
4102 create_flag |= O_SYNC;
4103 }
4104 #endif /* O_SYNC */
4105
4106 bool retry;
4107
4108 do {
4109 file.m_file = ::open(name, create_flag, os_innodb_umask);
4110
4111 if (file.m_file == -1) {
4112 const char* operation;
4113
4114 operation = (create_mode == OS_FILE_CREATE
4115 && !read_only) ? "create" : "open";
4116
4117 *success = false;
4118
4119 if (on_error_no_exit) {
4120 retry = os_file_handle_error_no_exit(
4121 name, operation, on_error_silent);
4122 } else {
4123 retry = os_file_handle_error(name, operation);
4124 }
4125 } else {
4126 *success = true;
4127 retry = false;
4128 }
4129
4130 } while (retry);
4131
4132 /* We disable OS caching (O_DIRECT) only on data files */
4133
4134 if (!read_only
4135 && *success
4136 && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE)
4137 && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
4138 || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) {
4139
4140 os_file_set_nocache(file.m_file, name, mode_str);
4141 } else if (!srv_read_only_mode
4142 && *success
4143 && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
4144 /* Do fsync() on log and files when setting O_DIRECT fails.
4145 See log_io_complete() */
4146 if (!os_file_set_nocache(file.m_file, name, mode_str)) {
4147 srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
4148 }
4149 }
4150
4151 #ifdef USE_FILE_LOCK
4152 if (!read_only
4153 && *success
4154 && create_mode != OS_FILE_OPEN_RAW
4155 && os_file_lock(file.m_file, name)) {
4156
4157 if (create_mode == OS_FILE_OPEN_RETRY) {
4158
4159 ib::info()
4160 << "Retrying to lock the first data file";
4161
4162 for (int i = 0; i < 100; i++) {
4163 os_thread_sleep(1000000);
4164
4165 if (!os_file_lock(file.m_file, name)) {
4166 *success = true;
4167 return(file);
4168 }
4169 }
4170
4171 ib::info()
4172 << "Unable to open the first data file";
4173 }
4174
4175 *success = false;
4176 close(file.m_file);
4177 file.m_file = -1;
4178 }
4179 #endif /* USE_FILE_LOCK */
4180
4181 return(file);
4182 }
4183
4184 /** NOTE! Use the corresponding macro
4185 os_file_create_simple_no_error_handling(), not directly this function!
4186 A simple function to open or create a file.
4187 @param[in] name name of the file or path as a null-terminated
4188 string
4189 @param[in] create_mode create mode
4190 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
4191 OS_FILE_READ_ALLOW_DELETE; the last option
4192 is used by a backup program reading the file
4193 @param[in] read_only if true read only mode checks are enforced
4194 @param[out] success true if succeeded
4195 @return own: handle to the file, not defined if error, error number
4196 can be retrieved with os_file_get_last_error */
4197 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4198 os_file_create_simple_no_error_handling_func(
4199 const char* name,
4200 ulint create_mode,
4201 ulint access_type,
4202 bool read_only,
4203 bool* success)
4204 {
4205 pfs_os_file_t file;
4206 int create_flag;
4207
4208 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4209 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4210
4211 *success = false;
4212
4213 if (create_mode == OS_FILE_OPEN) {
4214
4215 if (access_type == OS_FILE_READ_ONLY) {
4216
4217 create_flag = O_RDONLY;
4218
4219 } else if (read_only) {
4220
4221 create_flag = O_RDONLY;
4222
4223 } else {
4224
4225 ut_a(access_type == OS_FILE_READ_WRITE
4226 || access_type == OS_FILE_READ_ALLOW_DELETE);
4227
4228 create_flag = O_RDWR;
4229 }
4230
4231 } else if (read_only) {
4232
4233 create_flag = O_RDONLY;
4234
4235 } else if (create_mode == OS_FILE_CREATE) {
4236
4237 create_flag = O_RDWR | O_CREAT | O_EXCL;
4238
4239 } else {
4240
4241 ib::error()
4242 << "Unknown file create mode "
4243 << create_mode << " for file '" << name << "'";
4244 file.m_file = OS_FILE_CLOSED;
4245 return(file);
4246 }
4247
4248 file.m_file = ::open(name, create_flag, os_innodb_umask);
4249
4250 *success = (file.m_file != -1);
4251
4252 #ifdef USE_FILE_LOCK
4253 if (!read_only
4254 && *success
4255 && access_type == OS_FILE_READ_WRITE
4256 && os_file_lock(file.m_file, name)) {
4257
4258 *success = false;
4259 close(file.m_file);
4260 file.m_file = -1;
4261
4262 }
4263 #endif /* USE_FILE_LOCK */
4264
4265 return(file);
4266 }
4267
4268 /** Deletes a file if it exists. The file has to be closed before calling this.
4269 @param[in] name file path as a null-terminated string
4270 @param[out] exist indicate if file pre-exist
4271 @return true if success */
4272 bool
os_file_delete_if_exists_func(const char * name,bool * exist)4273 os_file_delete_if_exists_func(
4274 const char* name,
4275 bool* exist)
4276 {
4277 if (exist != NULL) {
4278 *exist = true;
4279 }
4280
4281 int ret = unlink(name);
4282
4283 if (ret != 0 && errno == ENOENT) {
4284 if (exist != NULL) {
4285 *exist = false;
4286 }
4287 } else if (ret != 0 && errno != ENOENT) {
4288 os_file_handle_error_no_exit(name, "delete", false);
4289
4290 return(false);
4291 }
4292
4293 return(true);
4294 }
4295
4296 /** Deletes a file. The file has to be closed before calling this.
4297 @param[in] name file path as a null-terminated string
4298 @return true if success */
4299 bool
os_file_delete_func(const char * name)4300 os_file_delete_func(
4301 const char* name)
4302 {
4303 int ret = unlink(name);
4304
4305 if (ret != 0) {
4306 os_file_handle_error_no_exit(name, "delete", false);
4307
4308 return(false);
4309 }
4310
4311 return(true);
4312 }
4313
4314 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
4315 function!
4316 Renames a file (can also move it to another directory). It is safest that the
4317 file is closed before calling this function.
4318 @param[in] oldpath old file path as a null-terminated string
4319 @param[in] newpath new file path
4320 @return true if success */
4321 bool
os_file_rename_func(const char * oldpath,const char * newpath)4322 os_file_rename_func(
4323 const char* oldpath,
4324 const char* newpath)
4325 {
4326 #ifdef UNIV_DEBUG
4327 os_file_type_t type;
4328 bool exists;
4329
4330 /* New path must not exist. */
4331 ut_ad(os_file_status(newpath, &exists, &type));
4332 ut_ad(!exists);
4333
4334 /* Old path must exist. */
4335 ut_ad(os_file_status(oldpath, &exists, &type));
4336 ut_ad(exists);
4337 #endif /* UNIV_DEBUG */
4338
4339 int ret = rename(oldpath, newpath);
4340
4341 if (ret != 0) {
4342 os_file_handle_error_no_exit(oldpath, "rename", false);
4343
4344 return(false);
4345 }
4346
4347 return(true);
4348 }
4349
4350 /** NOTE! Use the corresponding macro os_file_close(), not directly this
4351 function!
4352 Closes a file handle. In case of error, error number can be retrieved with
4353 os_file_get_last_error.
4354 @param[in] file Handle to close
4355 @return true if success */
4356 bool
os_file_close_func(os_file_t file)4357 os_file_close_func(
4358 os_file_t file)
4359 {
4360 int ret = close(file);
4361
4362 if (ret == -1) {
4363 os_file_handle_error(NULL, "close");
4364
4365 return(false);
4366 }
4367
4368 return(true);
4369 }
4370
4371 /** Announces an intention to access file data in a specific pattern in the
4372 future.
4373 @param[in, own] file handle to a file
4374 @param[in] offset file region offset
4375 @param[in] len file region length
4376 @param[in] advice advice for access pattern
4377 @return true if success */
4378 bool
os_file_advise(pfs_os_file_t file,os_offset_t offset,os_offset_t len,ulint advice)4379 os_file_advise(pfs_os_file_t file, os_offset_t offset, os_offset_t len,
4380 ulint advice)
4381 {
4382 #ifdef __WIN__
4383 return(true);
4384 #else
4385 #ifdef UNIV_LINUX
4386 int native_advice = 0;
4387 if ((advice & OS_FILE_ADVISE_NORMAL) != 0)
4388 native_advice |= POSIX_FADV_NORMAL;
4389 if ((advice & OS_FILE_ADVISE_RANDOM) != 0)
4390 native_advice |= POSIX_FADV_RANDOM;
4391 if ((advice & OS_FILE_ADVISE_SEQUENTIAL) != 0)
4392 native_advice |= POSIX_FADV_SEQUENTIAL;
4393 if ((advice & OS_FILE_ADVISE_WILLNEED) != 0)
4394 native_advice |= POSIX_FADV_WILLNEED;
4395 if ((advice & OS_FILE_ADVISE_DONTNEED) != 0)
4396 native_advice |= POSIX_FADV_DONTNEED;
4397 if ((advice & OS_FILE_ADVISE_NOREUSE) != 0)
4398 native_advice |= POSIX_FADV_NOREUSE;
4399
4400 return(posix_fadvise(file.m_file, offset, len, native_advice) == 0);
4401 #else
4402 return(true);
4403 #endif
4404 #endif /* __WIN__ */
4405 }
4406
4407
4408 /** Gets a file size.
4409 @param[in] file handle to an open file
4410 @return file size, or (os_offset_t) -1 on failure */
4411 os_offset_t
os_file_get_size(pfs_os_file_t file)4412 os_file_get_size(
4413 pfs_os_file_t file)
4414 {
4415 /* Store current position */
4416 os_offset_t pos = lseek(file.m_file, 0, SEEK_CUR);
4417 os_offset_t file_size = lseek(file.m_file, 0, SEEK_END);
4418
4419 /* Restore current position as the function should not change it */
4420 lseek(file.m_file, pos, SEEK_SET);
4421
4422 return(file_size);
4423 }
4424
4425 /** Gets a file size.
4426 @param[in] filename Full path to the filename to check
4427 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
4428 errno */
4429 os_file_size_t
os_file_get_size(const char * filename)4430 os_file_get_size(
4431 const char* filename)
4432 {
4433 struct stat s;
4434 os_file_size_t file_size;
4435
4436 int ret = stat(filename, &s);
4437
4438 if (ret == 0) {
4439 file_size.m_total_size = s.st_size;
4440 /* st_blocks is in 512 byte sized blocks */
4441 file_size.m_alloc_size = s.st_blocks * 512;
4442 } else {
4443 file_size.m_total_size = ~0;
4444 file_size.m_alloc_size = (os_offset_t) errno;
4445 }
4446
4447 return(file_size);
4448 }
4449
4450 /** This function returns information about the specified file
4451 @param[in] path pathname of the file
4452 @param[out] stat_info information of a file in a directory
4453 @param[in,out] statinfo information of a file in a directory
4454 @param[in] check_rw_perm for testing whether the file can be opened
4455 in RW mode
4456 @param[in] read_only if true read only mode checks are enforced
4457 @return DB_SUCCESS if all OK */
4458 static
4459 dberr_t
os_file_get_status_posix(const char * path,os_file_stat_t * stat_info,struct stat * statinfo,bool check_rw_perm,bool read_only)4460 os_file_get_status_posix(
4461 const char* path,
4462 os_file_stat_t* stat_info,
4463 struct stat* statinfo,
4464 bool check_rw_perm,
4465 bool read_only)
4466 {
4467 int ret = stat(path, statinfo);
4468
4469 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
4470 /* file does not exist */
4471
4472 return(DB_NOT_FOUND);
4473
4474 } else if (ret) {
4475 /* file exists, but stat call failed */
4476
4477 os_file_handle_error_no_exit(path, "stat", false);
4478
4479 return(DB_FAIL);
4480 }
4481
4482 switch (statinfo->st_mode & S_IFMT) {
4483 case S_IFDIR:
4484 stat_info->type = OS_FILE_TYPE_DIR;
4485 break;
4486 case S_IFLNK:
4487 stat_info->type = OS_FILE_TYPE_LINK;
4488 break;
4489 case S_IFBLK:
4490 /* Handle block device as regular file. */
4491 case S_IFCHR:
4492 /* Handle character device as regular file. */
4493 case S_IFREG:
4494 stat_info->type = OS_FILE_TYPE_FILE;
4495 break;
4496 default:
4497 stat_info->type = OS_FILE_TYPE_UNKNOWN;
4498 }
4499
4500 stat_info->size = statinfo->st_size;
4501 stat_info->block_size = statinfo->st_blksize;
4502 stat_info->alloc_size = statinfo->st_blocks * 512;
4503
4504 if (check_rw_perm
4505 && (stat_info->type == OS_FILE_TYPE_FILE
4506 || stat_info->type == OS_FILE_TYPE_BLOCK)) {
4507
4508 int access = !read_only ? O_RDWR : O_RDONLY;
4509 int fh = ::open(path, access, os_innodb_umask);
4510
4511 if (fh == -1) {
4512 stat_info->rw_perm = false;
4513 } else {
4514 stat_info->rw_perm = true;
4515 close(fh);
4516 }
4517 }
4518
4519 return(DB_SUCCESS);
4520 }
4521
4522 /** Truncates a file to a specified size in bytes.
4523 Do nothing if the size to preserve is greater or equal to the current
4524 size of the file.
4525 @param[in] pathname file path
4526 @param[in] file file to be truncated
4527 @param[in] size size to preserve in bytes
4528 @return true if success */
4529 static
4530 bool
os_file_truncate_posix(const char * pathname,pfs_os_file_t file,os_offset_t size)4531 os_file_truncate_posix(
4532 const char* pathname,
4533 pfs_os_file_t file,
4534 os_offset_t size)
4535 {
4536 int res = ftruncate(file.m_file, size);
4537 if (res == -1) {
4538
4539 bool retry;
4540
4541 retry = os_file_handle_error_no_exit(
4542 pathname, "truncate", false);
4543
4544 if (retry) {
4545 ib::warn()
4546 << "Truncate failed for '"
4547 << pathname << "'";
4548 }
4549 }
4550
4551 return(res == 0);
4552 }
4553
/** Truncates a file at its current position, i.e. at the offset that
ftell() reports for the stream.
@param[in]	file	file to be truncated
@return true if ftruncate() succeeded */
bool
os_file_set_eof(
	FILE*		file)
{
	const int	rc = ftruncate(fileno(file), ftell(file));

	return(rc == 0);
}
4562
4563 /** Closes a file handle.
4564 @param[in] file Handle to a file
4565 @return true if success */
4566 bool
os_file_close_no_error_handling_func(os_file_t file)4567 os_file_close_no_error_handling_func(
4568 os_file_t file)
4569 {
4570 return(close(file) != -1);
4571 }
4572
/** Meant for callers that want to post a batch of reads and have an
i/o-handler thread process them all at once later. Such callers must call
os_aio_simulated_wake_handler_threads afterwards so that the threads are
not left sleeping! */
void
os_aio_simulated_put_read_threads_to_sleep()
{
	/* Nothing to do on non-Windows builds. */
}
4582
4583 #else /* !_WIN32 */
4584
4585 #include <WinIoCtl.h>
4586
4587 /** Do the read/write
4588 @param[in] request The IO context and type
4589 @return the number of bytes read/written or negative value on error */
4590 ssize_t
execute(const IORequest & request)4591 SyncFileIO::execute(const IORequest& request)
4592 {
4593 OVERLAPPED seek;
4594
4595 memset(&seek, 0x0, sizeof(seek));
4596
4597 seek.Offset = (DWORD) m_offset & 0xFFFFFFFF;
4598 seek.OffsetHigh = (DWORD) (m_offset >> 32);
4599
4600 BOOL ret;
4601 DWORD n_bytes;
4602
4603 if (request.is_read()) {
4604 ret = ReadFile(m_fh, m_buf,
4605 static_cast<DWORD>(m_n), &n_bytes, &seek);
4606
4607 } else {
4608 ut_ad(request.is_write());
4609 ret = WriteFile(m_fh, m_buf,
4610 static_cast<DWORD>(m_n), &n_bytes, &seek);
4611 }
4612
4613 return(ret ? static_cast<ssize_t>(n_bytes) : -1);
4614 }
4615
4616 /** Do the read/write
4617 @param[in,out] slot The IO slot, it has the IO context
4618 @return the number of bytes read/written or negative value on error */
4619 ssize_t
execute(Slot * slot)4620 SyncFileIO::execute(Slot* slot)
4621 {
4622 BOOL ret;
4623
4624 if (slot->type.is_read()) {
4625 ret = ReadFile(
4626 slot->file.m_file, slot->ptr, slot->len,
4627 &slot->n_bytes, &slot->control);
4628 } else {
4629 ut_ad(slot->type.is_write());
4630 ret = WriteFile(
4631 slot->file.m_file, slot->ptr, slot->len,
4632 &slot->n_bytes, &slot->control);
4633 }
4634
4635 return(ret ? static_cast<ssize_t>(slot->n_bytes) : -1);
4636 }
4637
4638 /** Check if the file system supports sparse files.
4639 @param[in] name File name
4640 @return true if the file system supports sparse files */
4641 static
4642 bool
os_is_sparse_file_supported_win32(const char * filename)4643 os_is_sparse_file_supported_win32(const char* filename)
4644 {
4645 char volname[MAX_PATH];
4646 BOOL result = GetVolumePathName(filename, volname, MAX_PATH);
4647
4648 if (!result) {
4649
4650 ib::error()
4651 << "os_is_sparse_file_supported: "
4652 << "Failed to get the volume path name for: "
4653 << filename
4654 << "- OS error number " << GetLastError();
4655
4656 return(false);
4657 }
4658
4659 DWORD flags;
4660
4661 GetVolumeInformation(
4662 volname, NULL, MAX_PATH, NULL, NULL,
4663 &flags, NULL, MAX_PATH);
4664
4665 return(flags & FILE_SUPPORTS_SPARSE_FILES) ? true : false;
4666 }
4667
4668 /** Free storage space associated with a section of the file.
4669 @param[in] fh Open file handle
4670 @param[in] page_size Tablespace page size
4671 @param[in] block_size File system block size
4672 @param[in] off Starting offset (SEEK_SET)
4673 @param[in] len Size of the hole
4674 @return 0 on success or errno */
4675 static
4676 dberr_t
os_file_punch_hole_win32(os_file_t fh,os_offset_t off,os_offset_t len)4677 os_file_punch_hole_win32(
4678 os_file_t fh,
4679 os_offset_t off,
4680 os_offset_t len)
4681 {
4682 FILE_ZERO_DATA_INFORMATION punch;
4683
4684 punch.FileOffset.QuadPart = off;
4685 punch.BeyondFinalZero.QuadPart = off + len;
4686
4687 /* If lpOverlapped is NULL, lpBytesReturned cannot be NULL,
4688 therefore we pass a dummy parameter. */
4689 DWORD temp;
4690
4691 BOOL result = DeviceIoControl(
4692 fh, FSCTL_SET_ZERO_DATA, &punch, sizeof(punch),
4693 NULL, 0, &temp, NULL);
4694
4695 return(!result ? DB_IO_NO_PUNCH_HOLE : DB_SUCCESS);
4696 }
4697
4698 /** Check the existence and type of the given file.
4699 @param[in] path path name of file
4700 @param[out] exists true if the file exists
4701 @param[out] type Type of the file, if it exists
4702 @return true if call succeeded */
4703 bool
os_file_status_win32(const char * path,bool * exists,os_file_type_t * type)4704 os_file_status_win32(
4705 const char* path,
4706 bool* exists,
4707 os_file_type_t* type)
4708 {
4709 int ret;
4710 struct _stat64 statinfo;
4711
4712 ret = _stat64(path, &statinfo);
4713
4714 *exists = !ret;
4715
4716 if (!ret) {
4717 /* file exists, everything OK */
4718
4719 } else if (errno == ENOENT || errno == ENOTDIR
4720 || errno == ENAMETOOLONG) {
4721 /* file does not exist */
4722 return(true);
4723
4724 } else {
4725 /* file exists, but stat call failed */
4726 os_file_handle_error_no_exit(path, "stat", false);
4727 return(false);
4728 }
4729
4730 if (_S_IFDIR & statinfo.st_mode) {
4731 *type = OS_FILE_TYPE_DIR;
4732
4733 } else if (_S_IFREG & statinfo.st_mode) {
4734 *type = OS_FILE_TYPE_FILE;
4735
4736 } else {
4737 *type = OS_FILE_TYPE_UNKNOWN;
4738 }
4739
4740 return(true);
4741 }
4742
4743 /** NOTE! Use the corresponding macro os_file_flush(), not directly this
4744 function!
4745 Flushes the write buffers of a given file to the disk.
4746 @param[in] file handle to a file
4747 @return true if success */
4748 bool
os_file_flush_func(os_file_t file)4749 os_file_flush_func(
4750 os_file_t file)
4751 {
4752 ++os_n_fsyncs;
4753
4754 BOOL ret = FlushFileBuffers(file);
4755
4756 if (ret) {
4757 return(true);
4758 }
4759
4760 /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
4761 actually a raw device, we choose to ignore that error if we are using
4762 raw disks */
4763
4764 if (srv_start_raw_disk_in_use && GetLastError()
4765 == ERROR_INVALID_FUNCTION) {
4766 return(true);
4767 }
4768
4769 os_file_handle_error(NULL, "flush");
4770
4771 /* It is a fatal error if a file flush does not succeed, because then
4772 the database can get corrupt on disk */
4773 ut_error;
4774
4775 return(false);
4776 }
4777
4778 /** Retrieves the last error number if an error occurs in a file io function.
4779 The number should be retrieved before any other OS calls (because they may
4780 overwrite the error number). If the number is not known to this program,
4781 the OS error number + 100 is returned.
4782 @param[in] report_all_errors true if we want an error message printed
4783 of all errors
4784 @param[in] on_error_silent true then don't print any diagnostic
4785 to the log
4786 @return error number, or OS error number + 100 */
4787 static
4788 ulint
os_file_get_last_error_low(bool report_all_errors,bool on_error_silent)4789 os_file_get_last_error_low(
4790 bool report_all_errors,
4791 bool on_error_silent)
4792 {
4793 ulint err = (ulint) GetLastError();
4794
4795 if (err == ERROR_SUCCESS) {
4796 return(0);
4797 }
4798
4799 if (report_all_errors
4800 || (!on_error_silent
4801 && err != ERROR_DISK_FULL
4802 && err != ERROR_FILE_EXISTS)) {
4803
4804 ib::error()
4805 << "Operating system error number " << err
4806 << " in a file operation.";
4807
4808 if (err == ERROR_PATH_NOT_FOUND) {
4809 ib::error()
4810 << "The error means the system"
4811 " cannot find the path specified.";
4812
4813 if (srv_is_being_started) {
4814 ib::error()
4815 << "If you are installing InnoDB,"
4816 " remember that you must create"
4817 " directories yourself, InnoDB"
4818 " does not create them.";
4819 }
4820
4821 } else if (err == ERROR_ACCESS_DENIED) {
4822
4823 ib::error()
4824 << "The error means mysqld does not have"
4825 " the access rights to"
4826 " the directory. It may also be"
4827 " you have created a subdirectory"
4828 " of the same name as a data file.";
4829
4830 } else if (err == ERROR_SHARING_VIOLATION
4831 || err == ERROR_LOCK_VIOLATION) {
4832
4833 ib::error()
4834 << "The error means that another program"
4835 " is using InnoDB's files."
4836 " This might be a backup or antivirus"
4837 " software or another instance"
4838 " of MySQL."
4839 " Please close it to get rid of this error.";
4840
4841 } else if (err == ERROR_WORKING_SET_QUOTA
4842 || err == ERROR_NO_SYSTEM_RESOURCES) {
4843
4844 ib::error()
4845 << "The error means that there are no"
4846 " sufficient system resources or quota to"
4847 " complete the operation.";
4848
4849 } else if (err == ERROR_OPERATION_ABORTED) {
4850
4851 ib::error()
4852 << "The error means that the I/O"
4853 " operation has been aborted"
4854 " because of either a thread exit"
4855 " or an application request."
4856 " Retry attempt is made.";
4857 } else {
4858
4859 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
4860 }
4861 }
4862
4863 if (err == ERROR_FILE_NOT_FOUND) {
4864 return(OS_FILE_NOT_FOUND);
4865 } else if (err == ERROR_DISK_FULL) {
4866 return(OS_FILE_DISK_FULL);
4867 } else if (err == ERROR_FILE_EXISTS) {
4868 return(OS_FILE_ALREADY_EXISTS);
4869 } else if (err == ERROR_SHARING_VIOLATION
4870 || err == ERROR_LOCK_VIOLATION) {
4871 return(OS_FILE_SHARING_VIOLATION);
4872 } else if (err == ERROR_WORKING_SET_QUOTA
4873 || err == ERROR_NO_SYSTEM_RESOURCES) {
4874 return(OS_FILE_INSUFFICIENT_RESOURCE);
4875 } else if (err == ERROR_OPERATION_ABORTED) {
4876 return(OS_FILE_OPERATION_ABORTED);
4877 } else if (err == ERROR_ACCESS_DENIED) {
4878 return(OS_FILE_ACCESS_VIOLATION);
4879 }
4880
4881 return(OS_FILE_ERROR_MAX + err);
4882 }
4883
4884 /** NOTE! Use the corresponding macro os_file_create_simple(), not directly
4885 this function!
4886 A simple function to open or create a file.
4887 @param[in] name name of the file or path as a null-terminated
4888 string
4889 @param[in] create_mode create mode
4890 @param[in] access_type OS_FILE_READ_ONLY or OS_FILE_READ_WRITE
4891 @param[in] read_only if true read only mode checks are enforced
4892 @param[out] success true if succeed, false if error
4893 @return handle to the file, not defined if error, error number
4894 can be retrieved with os_file_get_last_error */
4895 pfs_os_file_t
os_file_create_simple_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)4896 os_file_create_simple_func(
4897 const char* name,
4898 ulint create_mode,
4899 ulint access_type,
4900 bool read_only,
4901 bool* success)
4902 {
4903 pfs_os_file_t file;
4904
4905 *success = false;
4906
4907 DWORD access;
4908 DWORD create_flag;
4909 DWORD attributes = 0;
4910
4911 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
4912 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
4913
4914 if (create_mode == OS_FILE_OPEN) {
4915
4916 create_flag = OPEN_EXISTING;
4917
4918 } else if (read_only) {
4919
4920 create_flag = OPEN_EXISTING;
4921
4922 } else if (create_mode == OS_FILE_CREATE) {
4923
4924 create_flag = CREATE_NEW;
4925
4926 } else if (create_mode == OS_FILE_CREATE_PATH) {
4927
4928 /* Create subdirs along the path if needed. */
4929 *success = os_file_create_subdirs_if_needed(name);
4930
4931 if (!*success) {
4932
4933 ib::error()
4934 << "Unable to create subdirectories '"
4935 << name << "'";
4936 file.m_file = OS_FILE_CLOSED;
4937 return(file);
4938 }
4939
4940 create_flag = CREATE_NEW;
4941 create_mode = OS_FILE_CREATE;
4942
4943 } else {
4944
4945 ib::error()
4946 << "Unknown file create mode ("
4947 << create_mode << ") for file '"
4948 << name << "'";
4949
4950 file.m_file = OS_FILE_CLOSED;
4951 return(file);
4952 }
4953
4954 if (access_type == OS_FILE_READ_ONLY) {
4955
4956 access = GENERIC_READ;
4957
4958 } else if (read_only) {
4959
4960 ib::info()
4961 << "Read only mode set. Unable to"
4962 " open file '" << name << "' in RW mode, "
4963 << "trying RO mode", name;
4964
4965 access = GENERIC_READ;
4966
4967 } else if (access_type == OS_FILE_READ_WRITE) {
4968
4969 access = GENERIC_READ | GENERIC_WRITE;
4970
4971 } else {
4972
4973 ib::error()
4974 << "Unknown file access type (" << access_type << ") "
4975 "for file '" << name << "'";
4976
4977 file.m_file = OS_FILE_CLOSED;
4978 return(file);
4979 }
4980
4981 bool retry;
4982
4983 do {
4984 /* Use default security attributes and no template file. */
4985
4986 file.m_file = CreateFile(
4987 (LPCTSTR) name, access, FILE_SHARE_READ, NULL,
4988 create_flag, attributes, NULL);
4989
4990 if (file.m_file == INVALID_HANDLE_VALUE) {
4991
4992 *success = false;
4993
4994 retry = os_file_handle_error(
4995 name, create_mode == OS_FILE_OPEN ?
4996 "open" : "create");
4997
4998 } else {
4999
5000 retry = false;
5001
5002 *success = true;
5003
5004 DWORD temp;
5005
5006 /* This is a best effort use case, if it fails then
5007 we will find out when we try and punch the hole. */
5008
5009 DeviceIoControl(
5010 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
5011 &temp, NULL);
5012 }
5013
5014 } while (retry);
5015
5016 return(file);
5017 }
5018
5019 /** This function attempts to create a directory named pathname. The new
5020 directory gets default permissions. On Unix the permissions are
5021 (0770 & ~umask). If the directory exists already, nothing is done and
5022 the call succeeds, unless the fail_if_exists arguments is true.
5023 If another error occurs, such as a permission error, this does not crash,
5024 but reports the error and returns false.
5025 @param[in] pathname directory name as null-terminated string
5026 @param[in] fail_if_exists if true, pre-existing directory is treated
5027 as an error.
5028 @return true if call succeeds, false on error */
5029 bool
os_file_create_directory(const char * pathname,bool fail_if_exists)5030 os_file_create_directory(
5031 const char* pathname,
5032 bool fail_if_exists)
5033 {
5034 BOOL rcode;
5035
5036 rcode = CreateDirectory((LPCTSTR) pathname, NULL);
5037 if (!(rcode != 0
5038 || (GetLastError() == ERROR_ALREADY_EXISTS
5039 && !fail_if_exists))) {
5040
5041 os_file_handle_error_no_exit(
5042 pathname, "CreateDirectory", false);
5043
5044 return(false);
5045 }
5046
5047 return(true);
5048 }
5049
5050 /** The os_file_opendir() function opens a directory stream corresponding to the
5051 directory named by the dirname argument. The directory stream is positioned
5052 at the first entry. In both Unix and Windows we automatically skip the '.'
5053 and '..' items at the start of the directory listing.
5054 @param[in] dirname directory name; it must not contain a trailing
5055 '\' or '/'
5056 @param[in] is_fatal true if we should treat an error as a fatal
5057 error; if we try to open symlinks then we do
5058 not wish a fatal error if it happens not to
5059 be a directory
5060 @return directory stream, NULL if error */
5061 os_file_dir_t
os_file_opendir(const char * dirname,bool error_is_fatal)5062 os_file_opendir(
5063 const char* dirname,
5064 bool error_is_fatal)
5065 {
5066 os_file_dir_t dir;
5067 LPWIN32_FIND_DATA lpFindFileData;
5068 char path[OS_FILE_MAX_PATH + 3];
5069
5070 ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
5071
5072 strcpy(path, dirname);
5073 strcpy(path + strlen(path), "\\*");
5074
5075 /* Note that in Windows opening the 'directory stream' also retrieves
5076 the first entry in the directory. Since it is '.', that is no problem,
5077 as we will skip over the '.' and '..' entries anyway. */
5078
5079 lpFindFileData = static_cast<LPWIN32_FIND_DATA>(
5080 ut_malloc_nokey(sizeof(WIN32_FIND_DATA)));
5081
5082 dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
5083
5084 ut_free(lpFindFileData);
5085
5086 if (dir == INVALID_HANDLE_VALUE) {
5087
5088 if (error_is_fatal) {
5089 os_file_handle_error(dirname, "opendir");
5090 }
5091
5092 return(NULL);
5093 }
5094
5095 return(dir);
5096 }
5097
5098 /** Closes a directory stream.
5099 @param[in] dir directory stream
5100 @return 0 if success, -1 if failure */
5101 int
os_file_closedir(os_file_dir_t dir)5102 os_file_closedir(
5103 os_file_dir_t dir)
5104 {
5105 BOOL ret;
5106
5107 ret = FindClose(dir);
5108
5109 if (!ret) {
5110 os_file_handle_error_no_exit(NULL, "closedir", false);
5111
5112 return(-1);
5113 }
5114
5115 return(0);
5116 }
5117
5118 /** This function returns information of the next file in the directory. We
5119 jump over the '.' and '..' entries in the directory.
5120 @param[in] dirname directory name or path
5121 @param[in] dir directory stream
5122 @param[out] info buffer where the info is returned
5123 @return 0 if ok, -1 if error, 1 if at the end of the directory */
5124 int
os_file_readdir_next_file(const char * dirname,os_file_dir_t dir,os_file_stat_t * info)5125 os_file_readdir_next_file(
5126 const char* dirname,
5127 os_file_dir_t dir,
5128 os_file_stat_t* info)
5129 {
5130 BOOL ret;
5131 int status;
5132 WIN32_FIND_DATA find_data;
5133
5134 next_file:
5135
5136 ret = FindNextFile(dir, &find_data);
5137
5138 if (ret > 0) {
5139
5140 const char* name;
5141
5142 name = static_cast<const char*>(find_data.cFileName);
5143
5144 ut_a(strlen(name) < OS_FILE_MAX_PATH);
5145
5146 if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) {
5147
5148 goto next_file;
5149 }
5150
5151 strcpy(info->name, name);
5152
5153 info->size = find_data.nFileSizeHigh;
5154 info->size <<= 32;
5155 info->size |= find_data.nFileSizeLow;
5156
5157 if (find_data.dwFileAttributes
5158 & FILE_ATTRIBUTE_REPARSE_POINT) {
5159
5160 /* TODO: test Windows symlinks */
5161 /* TODO: MySQL has apparently its own symlink
5162 implementation in Windows, dbname.sym can
5163 redirect a database directory:
5164 REFMAN "windows-symbolic-links.html" */
5165
5166 info->type = OS_FILE_TYPE_LINK;
5167
5168 } else if (find_data.dwFileAttributes
5169 & FILE_ATTRIBUTE_DIRECTORY) {
5170
5171 info->type = OS_FILE_TYPE_DIR;
5172
5173 } else {
5174
5175 /* It is probably safest to assume that all other
5176 file types are normal. Better to check them rather
5177 than blindly skip them. */
5178
5179 info->type = OS_FILE_TYPE_FILE;
5180 }
5181
5182 status = 0;
5183
5184 } else if (GetLastError() == ERROR_NO_MORE_FILES) {
5185
5186 status = 1;
5187
5188 } else {
5189
5190 os_file_handle_error_no_exit(NULL, "readdir_next_file", false);
5191
5192 status = -1;
5193 }
5194
5195 return(status);
5196 }
5197
5198 /** NOTE! Use the corresponding macro os_file_create(), not directly
5199 this function!
5200 Opens an existing file or creates a new.
5201 @param[in] name name of the file or path as a null-terminated
5202 string
5203 @param[in] create_mode create mode
5204 @param[in] purpose OS_FILE_AIO, if asynchronous, non-buffered I/O
5205 is desired, OS_FILE_NORMAL, if any normal file;
5206 NOTE that it also depends on type, os_aio_..
5207 and srv_.. variables whether we really use async
5208 I/O or unbuffered I/O: look in the function
5209 source code for the exact rules
5210 @param[in] type OS_DATA_FILE or OS_LOG_FILE
5211 @param[in] success true if succeeded
5212 @return handle to the file, not defined if error, error number
5213 can be retrieved with os_file_get_last_error */
5214 pfs_os_file_t
os_file_create_func(const char * name,ulint create_mode,ulint purpose,ulint type,bool read_only,bool * success)5215 os_file_create_func(
5216 const char* name,
5217 ulint create_mode,
5218 ulint purpose,
5219 ulint type,
5220 bool read_only,
5221 bool* success)
5222 {
5223 pfs_os_file_t file;
5224 bool retry;
5225 bool on_error_no_exit;
5226 bool on_error_silent;
5227
5228 *success = false;
5229
5230 DBUG_EXECUTE_IF(
5231 "ib_create_table_fail_disk_full",
5232 *success = false;
5233 SetLastError(ERROR_DISK_FULL);
5234 file.m_file = OS_FILE_CLOSED;
5235 return(file);
5236 );
5237
5238 DWORD create_flag;
5239 DWORD share_mode = FILE_SHARE_READ;
5240
5241 on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT
5242 ? true : false;
5243
5244 on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT
5245 ? true : false;
5246
5247 create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT;
5248 create_mode &= ~OS_FILE_ON_ERROR_SILENT;
5249
5250 if (create_mode == OS_FILE_OPEN_RAW) {
5251
5252 ut_a(!read_only);
5253
5254 create_flag = OPEN_EXISTING;
5255
5256 /* On Windows Physical devices require admin privileges and
5257 have to have the write-share mode set. See the remarks
5258 section for the CreateFile() function documentation in MSDN. */
5259
5260 share_mode |= FILE_SHARE_WRITE;
5261
5262 } else if (create_mode == OS_FILE_OPEN
5263 || create_mode == OS_FILE_OPEN_RETRY) {
5264
5265 create_flag = OPEN_EXISTING;
5266
5267 } else if (read_only) {
5268
5269 create_flag = OPEN_EXISTING;
5270
5271 } else if (create_mode == OS_FILE_CREATE) {
5272
5273 create_flag = CREATE_NEW;
5274
5275 } else if (create_mode == OS_FILE_OVERWRITE) {
5276
5277 create_flag = CREATE_ALWAYS;
5278
5279 } else {
5280 ib::error()
5281 << "Unknown file create mode (" << create_mode << ") "
5282 << " for file '" << name << "'";
5283
5284 file.m_file = OS_FILE_CLOSED;
5285 return(file);
5286 }
5287
5288 DWORD attributes = 0;
5289
5290 #ifdef UNIV_HOTBACKUP
5291 attributes |= FILE_FLAG_NO_BUFFERING;
5292 #else
5293 if (purpose == OS_FILE_AIO) {
5294
5295 #ifdef WIN_ASYNC_IO
5296 /* If specified, use asynchronous (overlapped) io and no
5297 buffering of writes in the OS */
5298
5299 if (srv_use_native_aio) {
5300 attributes |= FILE_FLAG_OVERLAPPED;
5301 }
5302 #endif /* WIN_ASYNC_IO */
5303
5304 } else if (purpose == OS_FILE_NORMAL) {
5305
5306 /* Use default setting. */
5307
5308 } else {
5309
5310 ib::error()
5311 << "Unknown purpose flag (" << purpose << ") "
5312 << "while opening file '" << name << "'";
5313
5314 file.m_file = OS_FILE_CLOSED;
5315 return(file);
5316 }
5317
5318 #ifdef UNIV_NON_BUFFERED_IO
5319 // TODO: Create a bug, this looks wrong. The flush log
5320 // parameter is dynamic.
5321 if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
5322
5323 /* Do not use unbuffered i/o for the log files because
5324 value 2 denotes that we do not flush the log at every
5325 commit, but only once per second */
5326
5327 } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) {
5328
5329 attributes |= FILE_FLAG_NO_BUFFERING;
5330 }
5331 #endif /* UNIV_NON_BUFFERED_IO */
5332
5333 #endif /* UNIV_HOTBACKUP */
5334 DWORD access = GENERIC_READ;
5335
5336 if (!read_only) {
5337 access |= GENERIC_WRITE;
5338 }
5339
5340 do {
5341 /* Use default security attributes and no template file. */
5342 file.m_file = CreateFile(
5343 (LPCTSTR) name, access, share_mode, NULL,
5344 create_flag, attributes, NULL);
5345
5346 if (file.m_file == INVALID_HANDLE_VALUE) {
5347 const char* operation;
5348
5349 operation = (create_mode == OS_FILE_CREATE
5350 && !read_only)
5351 ? "create" : "open";
5352
5353 *success = false;
5354
5355 if (on_error_no_exit) {
5356 retry = os_file_handle_error_no_exit(
5357 name, operation, on_error_silent);
5358 } else {
5359 retry = os_file_handle_error(name, operation);
5360 }
5361 } else {
5362
5363 retry = false;
5364
5365 *success = true;
5366
5367 DWORD temp;
5368
5369 /* This is a best effort use case, if it fails then
5370 we will find out when we try and punch the hole. */
5371 DeviceIoControl(
5372 file.m_file, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
5373 &temp, NULL);
5374 }
5375
5376 } while (retry);
5377
5378 return(file);
5379 }
5380
5381 /** NOTE! Use the corresponding macro os_file_create_simple_no_error_handling(),
5382 not directly this function!
5383 A simple function to open or create a file.
5384 @param[in] name name of the file or path as a null-terminated
5385 string
5386 @param[in] create_mode create mode
5387 @param[in] access_type OS_FILE_READ_ONLY, OS_FILE_READ_WRITE, or
5388 OS_FILE_READ_ALLOW_DELETE; the last option is
5389 used by a backup program reading the file
5390 @param[out] success true if succeeded
5391 @return own: handle to the file, not defined if error, error number
5392 can be retrieved with os_file_get_last_error */
5393 pfs_os_file_t
os_file_create_simple_no_error_handling_func(const char * name,ulint create_mode,ulint access_type,bool read_only,bool * success)5394 os_file_create_simple_no_error_handling_func(
5395 const char* name,
5396 ulint create_mode,
5397 ulint access_type,
5398 bool read_only,
5399 bool* success)
5400 {
5401 pfs_os_file_t file;
5402
5403 *success = false;
5404
5405 DWORD access;
5406 DWORD create_flag;
5407 DWORD attributes = 0;
5408 DWORD share_mode = FILE_SHARE_READ;
5409
5410 ut_a(name);
5411
5412 ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT));
5413 ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT));
5414
5415 if (create_mode == OS_FILE_OPEN) {
5416
5417 create_flag = OPEN_EXISTING;
5418
5419 } else if (read_only) {
5420
5421 create_flag = OPEN_EXISTING;
5422
5423 } else if (create_mode == OS_FILE_CREATE) {
5424
5425 create_flag = CREATE_NEW;
5426
5427 } else {
5428
5429 ib::error()
5430 << "Unknown file create mode (" << create_mode << ") "
5431 << " for file '" << name << "'";
5432
5433 file.m_file = OS_FILE_CLOSED;
5434 return(file);
5435 }
5436
5437 if (access_type == OS_FILE_READ_ONLY) {
5438
5439 access = GENERIC_READ;
5440
5441 } else if (read_only) {
5442
5443 access = GENERIC_READ;
5444
5445 } else if (access_type == OS_FILE_READ_WRITE) {
5446
5447 access = GENERIC_READ | GENERIC_WRITE;
5448
5449 } else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
5450
5451 ut_a(!read_only);
5452
5453 access = GENERIC_READ;
5454
5455 /*!< A backup program has to give mysqld the maximum
5456 freedom to do what it likes with the file */
5457
5458 share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE;
5459 } else {
5460
5461 ib::error()
5462 << "Unknown file access type (" << access_type << ") "
5463 << "for file '" << name << "'";
5464
5465 file.m_file = OS_FILE_CLOSED;
5466 return(file);
5467 }
5468
5469 file.m_file = CreateFile((LPCTSTR) name,
5470 access,
5471 share_mode,
5472 NULL, // Security attributes
5473 create_flag,
5474 attributes,
5475 NULL); // No template file
5476
5477 *success = (file.m_file != INVALID_HANDLE_VALUE);
5478
5479 return(file);
5480 }
5481
5482 /** Deletes a file if it exists. The file has to be closed before calling this.
5483 @param[in] name file path as a null-terminated string
5484 @param[out] exist indicate if file pre-exist
5485 @return true if success */
5486 bool
os_file_delete_if_exists_func(const char * name,bool * exist)5487 os_file_delete_if_exists_func(
5488 const char* name,
5489 bool* exist)
5490 {
5491 ulint count = 0;
5492
5493 if (exist != NULL) {
5494 *exist = true;
5495 }
5496
5497 for (;;) {
5498 /* In Windows, deleting an .ibd file may fail if ibbackup
5499 is copying it */
5500
5501 bool ret = DeleteFile((LPCTSTR) name);
5502
5503 if (ret) {
5504 return(true);
5505 }
5506
5507 DWORD lasterr = GetLastError();
5508
5509 if (lasterr == ERROR_FILE_NOT_FOUND
5510 || lasterr == ERROR_PATH_NOT_FOUND) {
5511
5512 /* the file does not exist, this not an error */
5513 if (exist != NULL) {
5514 *exist = false;
5515 }
5516
5517 return(true);
5518 }
5519
5520 ++count;
5521
5522 if (count > 100 && 0 == (count % 10)) {
5523
5524 /* Print error information */
5525 os_file_get_last_error(true);
5526
5527 ib::warn() << "Delete of file '" << name << "' failed.";
5528 }
5529
5530 /* Sleep for a second */
5531 os_thread_sleep(1000000);
5532
5533 if (count > 2000) {
5534
5535 return(false);
5536 }
5537 }
5538 }
5539
5540 /** Deletes a file. The file has to be closed before calling this.
5541 @param[in] name File path as NUL terminated string
5542 @return true if success */
5543 bool
os_file_delete_func(const char * name)5544 os_file_delete_func(
5545 const char* name)
5546 {
5547 ulint count = 0;
5548
5549 for (;;) {
5550 /* In Windows, deleting an .ibd file may fail if ibbackup
5551 is copying it */
5552
5553 BOOL ret = DeleteFile((LPCTSTR) name);
5554
5555 if (ret) {
5556 return(true);
5557 }
5558
5559 if (GetLastError() == ERROR_FILE_NOT_FOUND) {
5560 /* If the file does not exist, we classify this as
5561 a 'mild' error and return */
5562
5563 return(false);
5564 }
5565
5566 ++count;
5567
5568 if (count > 100 && 0 == (count % 10)) {
5569
5570 /* print error information */
5571 os_file_get_last_error(true);
5572
5573 ib::warn()
5574 << "Cannot delete file '" << name << "'. Are "
5575 << "you running ibbackup to back up the file?";
5576 }
5577
5578 /* sleep for a second */
5579 os_thread_sleep(1000000);
5580
5581 if (count > 2000) {
5582
5583 return(false);
5584 }
5585 }
5586
5587 ut_error;
5588 return(false);
5589 }
5590
5591 /** NOTE! Use the corresponding macro os_file_rename(), not directly this
5592 function!
5593 Renames a file (can also move it to another directory). It is safest that the
5594 file is closed before calling this function.
5595 @param[in] oldpath old file path as a null-terminated string
5596 @param[in] newpath new file path
5597 @return true if success */
5598 bool
os_file_rename_func(const char * oldpath,const char * newpath)5599 os_file_rename_func(
5600 const char* oldpath,
5601 const char* newpath)
5602 {
5603 #ifdef UNIV_DEBUG
5604 os_file_type_t type;
5605 bool exists;
5606
5607 /* New path must not exist. */
5608 ut_ad(os_file_status(newpath, &exists, &type));
5609 ut_ad(!exists);
5610
5611 /* Old path must exist. */
5612 ut_ad(os_file_status(oldpath, &exists, &type));
5613 ut_ad(exists);
5614 #endif /* UNIV_DEBUG */
5615
5616 if (MoveFile((LPCTSTR) oldpath, (LPCTSTR) newpath)) {
5617 return(true);
5618 }
5619
5620 os_file_handle_error_no_exit(oldpath, "rename", false);
5621
5622 return(false);
5623 }
5624
5625 /** NOTE! Use the corresponding macro os_file_close(), not directly
5626 this function!
5627 Closes a file handle. In case of error, error number can be retrieved with
5628 os_file_get_last_error.
5629 @param[in,own] file Handle to a file
5630 @return true if success */
5631 bool
os_file_close_func(os_file_t file)5632 os_file_close_func(
5633 os_file_t file)
5634 {
5635 ut_a(file > 0);
5636
5637 if (CloseHandle(file)) {
5638 return(true);
5639 }
5640
5641 os_file_handle_error(NULL, "close");
5642
5643 return(false);
5644 }
5645
5646 /** Gets a file size.
5647 @param[in] file Handle to a file
5648 @return file size, or (os_offset_t) -1 on failure */
5649 os_offset_t
os_file_get_size(pfs_os_file_t file)5650 os_file_get_size(
5651 pfs_os_file_t file)
5652 {
5653 DWORD high;
5654 DWORD low;
5655
5656 low = GetFileSize(file.m_file, &high);
5657
5658 if (low == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
5659 return((os_offset_t) -1);
5660 }
5661
5662 return(os_offset_t(low | (os_offset_t(high) << 32)));
5663 }
5664
5665 /** Gets a file size.
5666 @param[in] filename Full path to the filename to check
5667 @return file size if OK, else set m_total_size to ~0 and m_alloc_size to
5668 errno */
5669 os_file_size_t
os_file_get_size(const char * filename)5670 os_file_get_size(
5671 const char* filename)
5672 {
5673 struct __stat64 s;
5674 os_file_size_t file_size;
5675
5676 int ret = _stat64(filename, &s);
5677
5678 if (ret == 0) {
5679
5680 file_size.m_total_size = s.st_size;
5681
5682 DWORD low_size;
5683 DWORD high_size;
5684
5685 low_size = GetCompressedFileSize(filename, &high_size);
5686
5687 if (low_size != INVALID_FILE_SIZE) {
5688
5689 file_size.m_alloc_size = high_size;
5690 file_size.m_alloc_size <<= 32;
5691 file_size.m_alloc_size |= low_size;
5692
5693 } else {
5694 ib::error()
5695 << "GetCompressedFileSize("
5696 << filename << ", ..) failed.";
5697
5698 file_size.m_alloc_size = (os_offset_t) -1;
5699 }
5700 } else {
5701 file_size.m_total_size = ~0;
5702 file_size.m_alloc_size = (os_offset_t) ret;
5703 }
5704
5705 return(file_size);
5706 }
5707
5708 /** This function returns information about the specified file
5709 @param[in] path pathname of the file
5710 @param[out] stat_info information of a file in a directory
5711 @param[in,out] statinfo information of a file in a directory
5712 @param[in] check_rw_perm for testing whether the file can be opened
5713 in RW mode
5714 @param[in] read_only true if the file is opened in read-only mode
5715 @return DB_SUCCESS if all OK */
5716 static
5717 dberr_t
os_file_get_status_win32(const char * path,os_file_stat_t * stat_info,struct _stat64 * statinfo,bool check_rw_perm,bool read_only)5718 os_file_get_status_win32(
5719 const char* path,
5720 os_file_stat_t* stat_info,
5721 struct _stat64* statinfo,
5722 bool check_rw_perm,
5723 bool read_only)
5724 {
5725 int ret = _stat64(path, statinfo);
5726
5727 if (ret && (errno == ENOENT || errno == ENOTDIR)) {
5728 /* file does not exist */
5729
5730 return(DB_NOT_FOUND);
5731
5732 } else if (ret) {
5733 /* file exists, but stat call failed */
5734
5735 os_file_handle_error_no_exit(path, "stat", false);
5736
5737 return(DB_FAIL);
5738
5739 } else if (_S_IFDIR & statinfo->st_mode) {
5740
5741 stat_info->type = OS_FILE_TYPE_DIR;
5742
5743 } else if (_S_IFREG & statinfo->st_mode) {
5744
5745 DWORD access = GENERIC_READ;
5746
5747 if (!read_only) {
5748 access |= GENERIC_WRITE;
5749 }
5750
5751 stat_info->type = OS_FILE_TYPE_FILE;
5752
5753 /* Check if we can open it in read-only mode. */
5754
5755 if (check_rw_perm) {
5756 HANDLE fh;
5757
5758 fh = CreateFile(
5759 (LPCTSTR) path, // File to open
5760 access,
5761 0, // No sharing
5762 NULL, // Default security
5763 OPEN_EXISTING, // Existing file only
5764 FILE_ATTRIBUTE_NORMAL, // Normal file
5765 NULL); // No attr. template
5766
5767 if (fh == INVALID_HANDLE_VALUE) {
5768 stat_info->rw_perm = false;
5769 } else {
5770 stat_info->rw_perm = true;
5771 CloseHandle(fh);
5772 }
5773 }
5774
5775 char volname[MAX_PATH];
5776 BOOL result = GetVolumePathName(path, volname, MAX_PATH);
5777
5778 if (!result) {
5779
5780 ib::error()
5781 << "os_file_get_status_win32: "
5782 << "Failed to get the volume path name for: "
5783 << path
5784 << "- OS error number " << GetLastError();
5785
5786 return(DB_FAIL);
5787 }
5788
5789 DWORD sectorsPerCluster;
5790 DWORD bytesPerSector;
5791 DWORD numberOfFreeClusters;
5792 DWORD totalNumberOfClusters;
5793
5794 result = GetDiskFreeSpace(
5795 (LPCSTR) volname,
5796 §orsPerCluster,
5797 &bytesPerSector,
5798 &numberOfFreeClusters,
5799 &totalNumberOfClusters);
5800
5801 if (!result) {
5802
5803 ib::error()
5804 << "GetDiskFreeSpace(" << volname << ",...) "
5805 << "failed "
5806 << "- OS error number " << GetLastError();
5807
5808 return(DB_FAIL);
5809 }
5810
5811 stat_info->block_size = bytesPerSector * sectorsPerCluster;
5812
5813 /* On Windows the block size is not used as the allocation
5814 unit for sparse files. The underlying infra-structure for
5815 sparse files is based on NTFS compression. The punch hole
5816 is done on a "compression unit". This compression unit
5817 is based on the cluster size. You cannot punch a hole if
5818 the cluster size >= 8K. For smaller sizes the table is
5819 as follows:
5820
5821 Cluster Size Compression Unit
5822 512 Bytes 8 KB
5823 1 KB 16 KB
5824 2 KB 32 KB
5825 4 KB 64 KB
5826
5827 Default NTFS cluster size is 4K, compression unit size of 64K.
5828 Therefore unless the user has created the file system with
5829 a smaller cluster size and used larger page sizes there is
5830 little benefit from compression out of the box. */
5831
5832 stat_info->block_size = (stat_info->block_size <= 4096)
5833 ? stat_info->block_size * 16 : ULINT_UNDEFINED;
5834 } else {
5835 stat_info->type = OS_FILE_TYPE_UNKNOWN;
5836 }
5837
5838 return(DB_SUCCESS);
5839 }
5840
5841 /** Truncates a file to a specified size in bytes.
5842 Do nothing if the size to preserve is greater or equal to the current
5843 size of the file.
5844 @param[in] pathname file path
5845 @param[in] file file to be truncated
5846 @param[in] size size to preserve in bytes
5847 @return true if success */
5848 static
5849 bool
os_file_truncate_win32(const char * pathname,pfs_os_file_t file,os_offset_t size)5850 os_file_truncate_win32(
5851 const char* pathname,
5852 pfs_os_file_t file,
5853 os_offset_t size)
5854 {
5855 LARGE_INTEGER length;
5856
5857 length.QuadPart = size;
5858 BOOL success = SetFilePointerEx(file.m_file, length, NULL, FILE_BEGIN);
5859 if (!success) {
5860 os_file_handle_error_no_exit(
5861 pathname, "SetFilePointerEx", false);
5862 } else {
5863 success = SetEndOfFile(file.m_file);
5864 if (!success) {
5865 os_file_handle_error_no_exit(
5866 pathname, "SetEndOfFile", false);
5867 }
5868 }
5869 return(success);
5870 }
5871
5872 /** Truncates a file at its current position.
5873 @param[in] file Handle to be truncated
5874 @return true if success */
5875 bool
os_file_set_eof(FILE * file)5876 os_file_set_eof(
5877 FILE* file)
5878 {
5879 HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
5880
5881 return(SetEndOfFile(h));
5882 }
5883
5884 /** Closes a file handle.
5885 @param[in] file Handle to close
5886 @return true if success */
5887 bool
os_file_close_no_error_handling_func(os_file_t file)5888 os_file_close_no_error_handling_func(
5889 os_file_t file)
5890 {
5891 return(CloseHandle(file) ? true : false);
5892 }
5893
5894 /** This function can be called if one wants to post a batch of reads and
5895 prefers an i/o-handler thread to handle them all at once later. You must
5896 call os_aio_simulated_wake_handler_threads later to ensure the threads
5897 are not left sleeping! */
5898 void
os_aio_simulated_put_read_threads_to_sleep()5899 os_aio_simulated_put_read_threads_to_sleep()
5900 {
5901 AIO::simulated_put_read_threads_to_sleep();
5902 }
5903
5904 /** This function can be called if one wants to post a batch of reads and
5905 prefers an i/o-handler thread to handle them all at once later. You must
5906 call os_aio_simulated_wake_handler_threads later to ensure the threads
5907 are not left sleeping! */
5908 void
simulated_put_read_threads_to_sleep()5909 AIO::simulated_put_read_threads_to_sleep()
5910 {
5911 /* The idea of putting background IO threads to sleep is only for
5912 Windows when using simulated AIO. Windows XP seems to schedule
5913 background threads too eagerly to allow for coalescing during
5914 readahead requests. */
5915
5916 if (srv_use_native_aio) {
5917 /* We do not use simulated AIO: do nothing */
5918
5919 return;
5920 }
5921
5922 os_aio_recommend_sleep_for_read_threads = true;
5923
5924 for (ulint i = 0; i < os_aio_n_segments; i++) {
5925 AIO* array;
5926
5927 get_array_and_local_segment(&array, i);
5928
5929 if (array == s_reads) {
5930
5931 os_event_reset(os_aio_segment_wait_events[i]);
5932 }
5933 }
5934 }
5935
5936 #endif /* !_WIN32*/
5937
5938 /** Does a syncronous read or write depending upon the type specified
5939 In case of partial reads/writes the function tries
5940 NUM_RETRIES_ON_PARTIAL_IO times to read/write the complete data.
5941 @param[in] type, IO flags
5942 @param[in] file handle to an open file
5943 @param[out] buf buffer where to read
5944 @param[in] offset file offset from the start where to read
5945 @param[in] n number of bytes to read, starting from offset
5946 @param[out] err DB_SUCCESS or error code
5947 @return number of bytes read/written, -1 if error */
5948 static MY_ATTRIBUTE((warn_unused_result))
5949 ssize_t
os_file_io(const IORequest & in_type,os_file_t file,void * buf,ulint n,os_offset_t offset,dberr_t * err)5950 os_file_io(
5951 const IORequest&in_type,
5952 os_file_t file,
5953 void* buf,
5954 ulint n,
5955 os_offset_t offset,
5956 dberr_t* err)
5957 {
5958 Block* block = NULL;
5959 ulint original_n = n;
5960 IORequest type = in_type;
5961 ssize_t bytes_returned = 0;
5962 byte* encrypt_log_buf = NULL;
5963
5964 if (type.is_compressed()) {
5965
5966 /* We don't compress the first page of any file. */
5967 ut_ad(offset > 0);
5968
5969 block = os_file_compress_page(type, buf, &n);
5970 } else {
5971 block = NULL;
5972 }
5973
5974 /* We do encryption after compression, since if we do encryption
5975 before compression, the encrypted data will cause compression fail
5976 or low compression rate. */
5977 if (type.is_encrypted() && type.is_write() &&
5978 (type.encryption_algorithm().m_type != Encryption::KEYRING ||
5979 (type.encryption_algorithm().m_key != NULL &&
5980 Encryption::can_page_be_keyring_encrypted(reinterpret_cast<byte*>(buf))
5981 )
5982 )) {
5983 if (!type.is_log()) {
5984 /* We don't encrypt the first page of any file. */
5985 Block* compressed_block = block;
5986 ut_ad(offset > 0);
5987
5988 ut_ad(type.encryption_algorithm().m_key != NULL);
5989 block = os_file_encrypt_page(type, buf, &n);
5990
5991 if (compressed_block != NULL) {
5992 os_free_block(compressed_block);
5993 }
5994 } else {
5995 /* Skip encrypt log file header */
5996 if (offset >= LOG_FILE_HDR_SIZE) {
5997 block = os_file_encrypt_log(type,
5998 buf,
5999 encrypt_log_buf,
6000 &n);
6001 }
6002 }
6003 }
6004
6005 SyncFileIO sync_file_io(file, buf, n, offset);
6006
6007 for (ulint i = 0; i < NUM_RETRIES_ON_PARTIAL_IO; ++i) {
6008
6009 ssize_t n_bytes = sync_file_io.execute(type);
6010
6011 /* Check for a hard error. Not much we can do now. */
6012 if (n_bytes < 0) {
6013
6014 break;
6015
6016 } else if ((ulint) n_bytes + bytes_returned == n) {
6017
6018 bytes_returned += n_bytes;
6019
6020 if (offset > 0
6021 && (type.is_compressed() || type.is_read())) {
6022
6023 *err = os_file_io_complete(
6024 type, file,
6025 reinterpret_cast<byte*>(buf),
6026 NULL, original_n, offset, n);
6027 } else {
6028
6029 *err = DB_SUCCESS;
6030 }
6031
6032 if (block != NULL) {
6033 os_free_block(block);
6034 }
6035
6036 if (encrypt_log_buf != NULL) {
6037 ut_free(encrypt_log_buf);
6038 }
6039
6040 return(original_n);
6041 }
6042
6043 /* Handle partial read/write. */
6044
6045 ut_ad((ulint) n_bytes + bytes_returned < n);
6046
6047 bytes_returned += (ulint) n_bytes;
6048
6049 if (!type.is_partial_io_warning_disabled()) {
6050
6051 const char* op = type.is_read()
6052 ? "read" : "written";
6053
6054 ib::warn()
6055 << n
6056 << " bytes should have been " << op << ". Only "
6057 << bytes_returned
6058 << " bytes " << op << ". Retrying"
6059 << " for the remaining bytes.";
6060 }
6061
6062 /* Advance the offset and buffer by n_bytes */
6063 sync_file_io.advance(n_bytes);
6064 }
6065
6066 if (block != NULL) {
6067 os_free_block(block);
6068 }
6069
6070 if (encrypt_log_buf != NULL) {
6071 ut_free(encrypt_log_buf);
6072 }
6073
6074 *err = DB_IO_ERROR;
6075
6076 if (!type.is_partial_io_warning_disabled()) {
6077 ib::warn()
6078 << "Retry attempts for "
6079 << (type.is_read() ? "reading" : "writing")
6080 << " partial data failed.";
6081 }
6082
6083 return(bytes_returned);
6084 }
6085
6086 /** Does a synchronous write operation in Posix.
6087 @param[in] type IO context
6088 @param[in] file handle to an open file
6089 @param[out] buf buffer from which to write
6090 @param[in] n number of bytes to read, starting from offset
6091 @param[in] offset file offset from the start where to read
6092 @param[out] err DB_SUCCESS or error code
6093 @return number of bytes written, -1 if error */
6094 static MY_ATTRIBUTE((warn_unused_result))
6095 ssize_t
os_file_pwrite(IORequest & type,os_file_t file,const byte * buf,ulint n,os_offset_t offset,dberr_t * err)6096 os_file_pwrite(
6097 IORequest& type,
6098 os_file_t file,
6099 const byte* buf,
6100 ulint n,
6101 os_offset_t offset,
6102 dberr_t* err)
6103 {
6104 ut_ad(type.validate());
6105
6106 ++os_n_file_writes;
6107
6108 (void) os_atomic_increment_ulint(&os_n_pending_writes, 1);
6109 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES);
6110
6111 ssize_t n_bytes = os_file_io(type, file, (void*) buf, n, offset, err);
6112
6113 DBUG_EXECUTE_IF("xb_simulate_all_o_direct_write_failure",
6114 n_bytes = -1;
6115 errno = EINVAL;
6116 *err = DB_IO_ERROR;);
6117
6118 (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1);
6119 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES);
6120
6121 return(n_bytes);
6122 }
6123
6124 /** Requests a synchronous write operation.
6125 @param[in] type IO flags
6126 @param[in] file handle to an open file
6127 @param[out] buf buffer from which to write
6128 @param[in] offset file offset from the start where to read
6129 @param[in] n number of bytes to read, starting from offset
6130 @return DB_SUCCESS if request was successful, false if fail */
6131 static MY_ATTRIBUTE((warn_unused_result))
6132 dberr_t
os_file_write_page(IORequest & type,const char * name,os_file_t file,const byte * buf,os_offset_t offset,ulint n)6133 os_file_write_page(
6134 IORequest& type,
6135 const char* name,
6136 os_file_t file,
6137 const byte* buf,
6138 os_offset_t offset,
6139 ulint n)
6140 {
6141 dberr_t err;
6142 ut_ad(type.validate());
6143 ut_ad(n > 0);
6144
6145 ssize_t n_bytes = os_file_pwrite(type, file, buf, n, offset, &err);
6146
6147 if ((ulint) n_bytes != n && !os_has_said_disk_full) {
6148
6149 ib::error()
6150 << "Write to file " << name << "failed at offset "
6151 << offset << ", " << n
6152 << " bytes should have been written,"
6153 " only " << n_bytes << " were written."
6154 " Operating system error number " << errno << "."
6155 " Check that your OS and file system"
6156 " support files of this size."
6157 " Check also that the disk is not full"
6158 " or a disk quota exceeded.";
6159
6160 if (strerror(errno) != NULL) {
6161
6162 ib::error()
6163 << "Error number " << errno
6164 << " means '" << strerror(errno) << "'";
6165 }
6166
6167 ib::info() << OPERATING_SYSTEM_ERROR_MSG;
6168
6169 os_diagnose_all_o_direct_einval(errno);
6170
6171 os_has_said_disk_full = true;
6172 }
6173
6174 return(err);
6175 }
6176
6177 /** Does a synchronous read operation in Posix.
6178 @param[in] type IO flags
6179 @param[in] file handle to an open file
6180 @param[out] buf buffer where to read
6181 @param[in] offset file offset from the start where to read
6182 @param[in] n number of bytes to read, starting from offset
6183 @param[out] err DB_SUCCESS or error code
6184 @return number of bytes read, -1 if error */
6185 static MY_ATTRIBUTE((warn_unused_result))
6186 ssize_t
os_file_pread(IORequest & type,os_file_t file,void * buf,ulint n,os_offset_t offset,trx_t * trx,dberr_t * err)6187 os_file_pread(
6188 IORequest& type,
6189 os_file_t file,
6190 void* buf,
6191 ulint n,
6192 os_offset_t offset,
6193 trx_t* trx,
6194 dberr_t* err)
6195 {
6196 ++os_n_file_reads;
6197
6198 const ib_time_monotonic_us_t start_time = trx_stats::start_io_read(trx, n);
6199
6200 (void) os_atomic_increment_ulint(&os_n_pending_reads, 1);
6201 MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS);
6202
6203 ssize_t n_bytes = os_file_io(type, file, buf, n, offset, err);
6204
6205 DBUG_EXECUTE_IF("xb_simulate_all_o_direct_read_failure",
6206 n_bytes = -1;
6207 errno = EINVAL;);
6208
6209 trx_stats::end_io_read(trx, start_time);
6210
6211 (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1);
6212 MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS);
6213
6214 return(n_bytes);
6215 }
6216
6217 /** Requests a synchronous positioned read operation.
6218 @return DB_SUCCESS if request was successful, false if fail
6219 @param[in] type IO flags
6220 @param[in] file handle to an open file
6221 @param[out] buf buffer where to read
6222 @param[in] offset file offset from the start where to read
6223 @param[in] n number of bytes to read, starting from offset
6224 @param[out] o number of bytes actually read
6225 @param[in] exit_on_err if true then exit on error
6226 @return DB_SUCCESS or error code */
6227 static MY_ATTRIBUTE((warn_unused_result))
6228 dberr_t
os_file_read_page(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o,bool exit_on_err,trx_t * trx)6229 os_file_read_page(
6230 IORequest& type,
6231 os_file_t file,
6232 void* buf,
6233 os_offset_t offset,
6234 ulint n,
6235 ulint* o,
6236 bool exit_on_err,
6237 trx_t* trx)
6238 {
6239 dberr_t err;
6240
6241 os_bytes_read_since_printout += n;
6242
6243 ut_ad(type.validate());
6244 ut_ad(n > 0);
6245
6246 for (;;) {
6247 ssize_t n_bytes;
6248
6249 n_bytes = os_file_pread(type, file, buf, n, offset, trx, &err);
6250
6251 if (o != NULL) {
6252 *o = n_bytes;
6253 }
6254
6255 if (err != DB_SUCCESS && !exit_on_err) {
6256
6257 return(err);
6258
6259 } else if ((ulint) n_bytes == n) {
6260
6261 /*The page decryption failed - will handled by buf_io_comptelete*/
6262 if (err == DB_IO_DECRYPT_FAIL)
6263 return (DB_IO_DECRYPT_FAIL);
6264
6265 /** The read will succeed but decompress can fail
6266 for various reasons. */
6267
6268 if (type.is_compression_enabled()
6269 && !Compression::is_compressed_page(
6270 static_cast<byte*>(buf))) {
6271
6272 return(DB_SUCCESS);
6273
6274 } else {
6275 return(err);
6276 }
6277 }
6278
6279 const std::string fd_path = os_file_find_path_for_fd(file);
6280 if (!fd_path.empty()) {
6281 ib::error() << "Tried to read " << n
6282 << " bytes at offset " << offset
6283 << ", but was only able to read " << n_bytes
6284 << " of FD " << file
6285 << ", filename " << fd_path;
6286 } else {
6287 ib::error() << "Tried to read " << n
6288 << " bytes at offset " << offset
6289 << ", but was only able to read " << n_bytes;
6290 }
6291
6292 if (exit_on_err) {
6293
6294 if (!os_file_handle_error(NULL, "read")) {
6295 /* Hard error */
6296 break;
6297 }
6298
6299 } else if (!os_file_handle_error_no_exit(NULL, "read", false)) {
6300
6301 /* Hard error */
6302 break;
6303 }
6304
6305 if (n_bytes > 0 && (ulint) n_bytes < n) {
6306 n -= (ulint) n_bytes;
6307 offset += (ulint) n_bytes;
6308 buf = reinterpret_cast<uchar*>(buf) + (ulint) n_bytes;
6309 }
6310 }
6311
6312 ib::fatal()
6313 << "Cannot read from file. OS error number "
6314 << errno << ".";
6315
6316 return(err);
6317 }
6318
6319 /** Retrieves the last error number if an error occurs in a file io function.
6320 The number should be retrieved before any other OS calls (because they may
6321 overwrite the error number). If the number is not known to this program,
6322 the OS error number + 100 is returned.
6323 @param[in] report_all_errors true if we want an error printed
6324 for all errors
6325 @return error number, or OS error number + 100 */
6326 ulint
os_file_get_last_error(bool report_all_errors)6327 os_file_get_last_error(
6328 bool report_all_errors)
6329 {
6330 return(os_file_get_last_error_low(report_all_errors, false));
6331 }
6332
6333 /** Does error handling when a file operation fails.
6334 Conditionally exits (calling srv_fatal_error()) based on should_exit value
6335 and the error type, if should_exit is true then on_error_silent is ignored.
6336 @param[in] name name of a file or NULL
6337 @param[in] operation operation
6338 @param[in] should_exit call srv_fatal_error() on an unknown error,
6339 if this parameter is true
6340 @param[in] on_error_silent if true then don't print any message to the log
6341 iff it is an unknown non-fatal error
6342 @return true if we should retry the operation */
6343 static MY_ATTRIBUTE((warn_unused_result))
6344 bool
os_file_handle_error_cond_exit(const char * name,const char * operation,bool should_exit,bool on_error_silent)6345 os_file_handle_error_cond_exit(
6346 const char* name,
6347 const char* operation,
6348 bool should_exit,
6349 bool on_error_silent)
6350 {
6351 ulint err;
6352
6353 err = os_file_get_last_error_low(false, on_error_silent);
6354
6355 switch (err) {
6356 case OS_FILE_DISK_FULL:
6357 /* We only print a warning about disk full once */
6358
6359 if (os_has_said_disk_full) {
6360
6361 return(false);
6362 }
6363
6364 /* Disk full error is reported irrespective of the
6365 on_error_silent setting. */
6366
6367 if (name) {
6368
6369 ib::error()
6370 << "Encountered a problem with file '"
6371 << name << "'";
6372 }
6373
6374 ib::error()
6375 << "Disk is full. Try to clean the disk to free space.";
6376
6377 os_has_said_disk_full = true;
6378
6379 return(false);
6380
6381 case OS_FILE_AIO_RESOURCES_RESERVED:
6382 case OS_FILE_AIO_INTERRUPTED:
6383
6384 return(true);
6385
6386 case OS_FILE_PATH_ERROR:
6387 case OS_FILE_ALREADY_EXISTS:
6388 case OS_FILE_ACCESS_VIOLATION:
6389
6390 return(false);
6391
6392 case OS_FILE_SHARING_VIOLATION:
6393
6394 os_thread_sleep(10000000); /* 10 sec */
6395 return(true);
6396
6397 case OS_FILE_OPERATION_ABORTED:
6398 case OS_FILE_INSUFFICIENT_RESOURCE:
6399
6400 os_thread_sleep(100000); /* 100 ms */
6401 return(true);
6402
6403 default:
6404
6405 /* If it is an operation that can crash on error then it
6406 is better to ignore on_error_silent and print an error message
6407 to the log. */
6408
6409 if (should_exit || !on_error_silent) {
6410 ib::error() << "File "
6411 << (name != NULL ? name : "(unknown)")
6412 << ": '" << operation << "'"
6413 " returned OS error " << err << "."
6414 << (should_exit
6415 ? " Cannot continue operation" : "");
6416 }
6417
6418 if (should_exit) {
6419 srv_fatal_error();
6420 }
6421 }
6422
6423 return(false);
6424 }
6425
6426 /** Does error handling when a file operation fails.
6427 @param[in] name name of a file or NULL
6428 @param[in] operation operation name that failed
6429 @return true if we should retry the operation */
6430 static
6431 bool
os_file_handle_error(const char * name,const char * operation)6432 os_file_handle_error(
6433 const char* name,
6434 const char* operation)
6435 {
6436 /* Exit in case of unknown error */
6437 return(os_file_handle_error_cond_exit(name, operation, true, false));
6438 }
6439
6440 /** Does error handling when a file operation fails.
6441 @param[in] name name of a file or NULL
6442 @param[in] operation operation name that failed
6443 @param[in] on_error_silent if true then don't print any message to the log.
6444 @return true if we should retry the operation */
6445 static
6446 bool
os_file_handle_error_no_exit(const char * name,const char * operation,bool on_error_silent)6447 os_file_handle_error_no_exit(
6448 const char* name,
6449 const char* operation,
6450 bool on_error_silent)
6451 {
6452 /* Don't exit in case of unknown error */
6453 return(os_file_handle_error_cond_exit(
6454 name, operation, false, on_error_silent));
6455 }
6456
6457 /** Tries to disable OS caching on an opened file descriptor.
6458 @param[in] fd file descriptor to alter
6459 @param[in] file_name file name, used in the diagnostic message
6460 @param[in] name "open" or "create"; used in the diagnostic
6461 message
6462 @param[in] failure_warning if true (the default), the failure to disable
6463 caching is diagnosed at warning severity, and at note severity otherwise
6464 @return true if operation is success and false */
6465 bool
os_file_set_nocache(int fd MY_ATTRIBUTE ((unused)),const char * file_name MY_ATTRIBUTE ((unused)),const char * operation_name MY_ATTRIBUTE ((unused)),bool failure_warning MY_ATTRIBUTE ((unused)))6466 os_file_set_nocache(
6467 int fd MY_ATTRIBUTE((unused)),
6468 const char* file_name MY_ATTRIBUTE((unused)),
6469 const char* operation_name MY_ATTRIBUTE((unused)),
6470 bool failure_warning MY_ATTRIBUTE((unused)))
6471 {
6472 /* some versions of Solaris may not have DIRECTIO_ON */
6473 #if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
6474 if (directio(fd, DIRECTIO_ON) == -1) {
6475 int errno_save = errno;
6476
6477 ib::error()
6478 << "Failed to set DIRECTIO_ON on file "
6479 << file_name << ": " << operation_name
6480 << strerror(errno_save) << ","
6481 " continuing anyway.";
6482 return false;
6483 }
6484 #elif defined(O_DIRECT)
6485 if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
6486 int errno_save = errno;
6487 static bool warning_message_printed = false;
6488 if (errno_save == EINVAL) {
6489 if (!warning_message_printed) {
6490 warning_message_printed = true;
6491 # ifdef UNIV_LINUX
6492 ib::warn_or_info(failure_warning)
6493 << "Failed to set O_DIRECT on file "
6494 << file_name << ";" << operation_name
6495 << ": " << strerror(errno_save) << ", "
6496 << "continuing anyway. O_DIRECT is "
6497 "known to result in 'Invalid argument' "
6498 "on Linux on tmpfs, "
6499 "see MySQL Bug#26662.";
6500 # else /* UNIV_LINUX */
6501 goto short_warning;
6502 # endif /* UNIV_LINUX */
6503 }
6504 } else {
6505 # ifndef UNIV_LINUX
6506 short_warning:
6507 # endif
6508 ib::warn_or_info(failure_warning)
6509 << "Failed to set O_DIRECT on file "
6510 << file_name << "; " << operation_name
6511 << " : " << strerror(errno_save)
6512 << " continuing anyway.";
6513 }
6514 return false;
6515 }
6516 #endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */
6517 return true;
6518 }
6519
6520 /** Write the specified number of zeros to a newly created file.
6521 @param[in] name name of the file or path as a null-terminated
6522 string
6523 @param[in] file handle to a file
6524 @param[in] size file size
6525 @param[in] read_only Enable read-only checks if true
6526 @return true if success */
6527 bool
os_file_set_size(const char * name,pfs_os_file_t file,os_offset_t size,bool read_only)6528 os_file_set_size(
6529 const char* name,
6530 pfs_os_file_t file,
6531 os_offset_t size,
6532 bool read_only)
6533 {
6534 /* Write up to 1 megabyte at a time. */
6535 ulint buf_size = ut_min(
6536 static_cast<ulint>(64),
6537 static_cast<ulint>(size / UNIV_PAGE_SIZE));
6538
6539 buf_size *= UNIV_PAGE_SIZE;
6540
6541 /* Align the buffer for possible raw i/o */
6542 byte* buf2;
6543
6544 buf2 = static_cast<byte*>(ut_malloc_nokey(buf_size + UNIV_PAGE_SIZE));
6545
6546 byte* buf = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
6547
6548 /* Write buffer full of zeros */
6549 memset(buf, 0, buf_size);
6550
6551 if (size >= (os_offset_t) 100 << 20) {
6552
6553 ib::info() << "Progress in MB:";
6554 }
6555
6556 os_offset_t current_size = 0;
6557
6558 while (current_size < size) {
6559 ulint n_bytes;
6560
6561 if (size - current_size < (os_offset_t) buf_size) {
6562 n_bytes = (ulint) (size - current_size);
6563 } else {
6564 n_bytes = buf_size;
6565 }
6566
6567 dberr_t err;
6568 IORequest request(IORequest::WRITE);
6569
6570 #ifdef UNIV_HOTBACKUP
6571
6572 err = os_file_write(
6573 request, name, file, buf, current_size, n_bytes);
6574 #else
6575 /* Using OS_AIO_SYNC mode on POSIX systems will result in
6576 fall back to os_file_write/read. On Windows it will use
6577 special mechanism to wait before it returns back. */
6578
6579 err = os_aio(
6580 request,
6581 OS_AIO_SYNC, name,
6582 file, buf, current_size, n_bytes,
6583 read_only, NULL, NULL, 0, NULL, false);
6584 #endif /* UNIV_HOTBACKUP */
6585
6586 if (err != DB_SUCCESS) {
6587
6588 ut_free(buf2);
6589 return(false);
6590 }
6591
6592 /* Print about progress for each 100 MB written */
6593 if ((current_size + n_bytes) / (100 << 20)
6594 != current_size / (100 << 20)) {
6595
6596 fprintf(stderr, " %lu00",
6597 (ulong) ((current_size + n_bytes)
6598 / (100 << 20)));
6599 }
6600
6601 current_size += n_bytes;
6602 }
6603
6604 if (size >= (os_offset_t) 100 << 20) {
6605
6606 fprintf(stderr, "\n");
6607 }
6608
6609 ut_free(buf2);
6610
6611 return(os_file_flush(file));
6612 }
6613
6614 /** Truncates a file to a specified size in bytes.
6615 Do nothing if the size to preserve is greater or equal to the current
6616 size of the file.
6617 @param[in] pathname file path
6618 @param[in] file file to be truncated
6619 @param[in] size size to preserve in bytes
6620 @return true if success */
6621 bool
os_file_truncate(const char * pathname,pfs_os_file_t file,os_offset_t size)6622 os_file_truncate(
6623 const char* pathname,
6624 pfs_os_file_t file,
6625 os_offset_t size)
6626 {
6627 /* Do nothing if the size preserved is larger than or equal to the
6628 current size of file */
6629 os_offset_t size_bytes = os_file_get_size(file);
6630
6631 if (size >= size_bytes) {
6632 return(true);
6633 }
6634
6635 #ifdef _WIN32
6636 return(os_file_truncate_win32(pathname, file, size));
6637 #else /* _WIN32 */
6638 return(os_file_truncate_posix(pathname, file, size));
6639 #endif /* _WIN32 */
6640 }
6641
6642 /** NOTE! Use the corresponding macro os_file_read(), not directly this
6643 function!
6644 Requests a synchronous positioned read operation.
6645 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6646 @param[in] type IO flags
6647 @param[in] file handle to an open file
6648 @param[out] buf buffer where to read
6649 @param[in] offset file offset from the start where to read
6650 @param[in] n number of bytes to read, starting from offset
6651 @return DB_SUCCESS or error code */
6652 dberr_t
os_file_read_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,trx_t * trx)6653 os_file_read_func(
6654 IORequest& type,
6655 os_file_t file,
6656 void* buf,
6657 os_offset_t offset,
6658 ulint n,
6659 trx_t* trx)
6660 {
6661 ut_ad(type.is_read());
6662
6663 return(os_file_read_page(type, file, buf, offset, n, NULL, true, trx));
6664 }
6665
6666 /** NOTE! Use the corresponding macro os_file_read_no_error_handling(),
6667 not directly this function!
6668 Requests a synchronous positioned read operation.
6669 @return DB_SUCCESS if request was successful, DB_IO_ERROR on failure
6670 @param[in] type IO flags
6671 @param[in] file handle to an open file
6672 @param[out] buf buffer where to read
6673 @param[in] offset file offset from the start where to read
6674 @param[in] n number of bytes to read, starting from offset
6675 @param[out] o number of bytes actually read
6676 @return DB_SUCCESS or error code */
6677 dberr_t
os_file_read_no_error_handling_func(IORequest & type,os_file_t file,void * buf,os_offset_t offset,ulint n,ulint * o)6678 os_file_read_no_error_handling_func(
6679 IORequest& type,
6680 os_file_t file,
6681 void* buf,
6682 os_offset_t offset,
6683 ulint n,
6684 ulint* o)
6685 {
6686 ut_ad(type.is_read());
6687
6688 return(os_file_read_page(type, file, buf, offset, n, o, false, NULL));
6689 }
6690
6691 /** NOTE! Use the corresponding macro os_file_write(), not directly
6692 Requests a synchronous write operation.
6693 @param[in] type IO flags
6694 @param[in] file handle to an open file
6695 @param[out] buf buffer from which to write
6696 @param[in] offset file offset from the start where to read
6697 @param[in] n number of bytes to read, starting from offset
6698 @return DB_SUCCESS if request was successful, false if fail */
6699 dberr_t
os_file_write_func(IORequest & type,const char * name,os_file_t file,const void * buf,os_offset_t offset,ulint n)6700 os_file_write_func(
6701 IORequest& type,
6702 const char* name,
6703 os_file_t file,
6704 const void* buf,
6705 os_offset_t offset,
6706 ulint n)
6707 {
6708 ut_ad(type.validate());
6709 ut_ad(type.is_write());
6710
6711 /* We never compress the first page.
6712 Note: This assumes we always do block IO. */
6713 if (offset == 0) {
6714 type.clear_compressed();
6715 }
6716
6717 const byte* ptr = reinterpret_cast<const byte*>(buf);
6718
6719 return(os_file_write_page(type, name, file, ptr, offset, n));
6720 }
6721
6722 /** Check the existence and type of the given file.
6723 @param[in] path path name of file
6724 @param[out] exists true if the file exists
6725 @param[out] type Type of the file, if it exists
6726 @return true if call succeeded */
6727 bool
os_file_status(const char * path,bool * exists,os_file_type_t * type)6728 os_file_status(
6729 const char* path,
6730 bool* exists,
6731 os_file_type_t* type)
6732 {
6733 #ifdef _WIN32
6734 return(os_file_status_win32(path, exists, type));
6735 #else
6736 return(os_file_status_posix(path, exists, type));
6737 #endif /* _WIN32 */
6738 }
6739
6740 /** Free storage space associated with a section of the file.
6741 @param[in] fh Open file handle
6742 @param[in] off Starting offset (SEEK_SET)
6743 @param[in] len Size of the hole
6744 @return DB_SUCCESS or error code */
6745 dberr_t
os_file_punch_hole(os_file_t fh,os_offset_t off,os_offset_t len)6746 os_file_punch_hole(
6747 os_file_t fh,
6748 os_offset_t off,
6749 os_offset_t len)
6750 {
6751 /* In this debugging mode, we act as if punch hole is supported,
6752 and then skip any calls to actually punch a hole here.
6753 In this way, Transparent Page Compression is still being tested. */
6754 DBUG_EXECUTE_IF("ignore_punch_hole",
6755 return(DB_SUCCESS);
6756 );
6757
6758 #ifdef _WIN32
6759 return(os_file_punch_hole_win32(fh, off, len));
6760 #else
6761 return(os_file_punch_hole_posix(fh, off, len));
6762 #endif /* _WIN32 */
6763 }
6764
6765 /** Check if the file system supports sparse files.
6766
6767 Warning: On POSIX systems we try and punch a hole from offset 0 to
6768 the system configured page size. This should only be called on an empty
6769 file.
6770
6771 Note: On Windows we use the name and on Unices we use the file handle.
6772
6773 @param[in] name File name
6774 @param[in] fh File handle for the file - if opened
6775 @return true if the file system supports sparse files */
6776 bool
os_is_sparse_file_supported(const char * path,pfs_os_file_t fh)6777 os_is_sparse_file_supported(const char* path, pfs_os_file_t fh)
6778 {
6779 /* In this debugging mode, we act as if punch hole is supported,
6780 then we skip any calls to actually punch a hole. In this way,
6781 Transparent Page Compression is still being tested. */
6782 DBUG_EXECUTE_IF("ignore_punch_hole",
6783 return(true);
6784 );
6785
6786 #ifdef _WIN32
6787 return(os_is_sparse_file_supported_win32(path));
6788 #else
6789 dberr_t err;
6790
6791 /* We don't know the FS block size, use the sector size. The FS
6792 will do the magic. */
6793 err = os_file_punch_hole(fh.m_file, 0, UNIV_PAGE_SIZE);
6794
6795 return(err == DB_SUCCESS);
6796 #endif /* _WIN32 */
6797 }
6798
6799 /** This function returns information about the specified file
6800 @param[in] path pathname of the file
6801 @param[out] stat_info information of a file in a directory
6802 @param[in] check_rw_perm for testing whether the file can be opened
6803 in RW mode
6804 @param[in] read_only true if file is opened in read-only mode
6805 @return DB_SUCCESS if all OK */
6806 dberr_t
os_file_get_status(const char * path,os_file_stat_t * stat_info,bool check_rw_perm,bool read_only)6807 os_file_get_status(
6808 const char* path,
6809 os_file_stat_t* stat_info,
6810 bool check_rw_perm,
6811 bool read_only)
6812 {
6813 dberr_t ret;
6814
6815 #ifdef _WIN32
6816 struct _stat64 info;
6817
6818 ret = os_file_get_status_win32(
6819 path, stat_info, &info, check_rw_perm, read_only);
6820
6821 #else
6822 struct stat info;
6823
6824 ret = os_file_get_status_posix(
6825 path, stat_info, &info, check_rw_perm, read_only);
6826
6827 #endif /* _WIN32 */
6828
6829 if (ret == DB_SUCCESS) {
6830 stat_info->ctime = info.st_ctime;
6831 stat_info->atime = info.st_atime;
6832 stat_info->mtime = info.st_mtime;
6833 stat_info->size = info.st_size;
6834 }
6835
6836 return(ret);
6837 }
6838
6839 /**
6840 Waits for an AIO operation to complete. This function is used to wait the
6841 for completed requests. The aio array of pending requests is divided
6842 into segments. The thread specifies which segment or slot it wants to wait
6843 for. NOTE: this function will also take care of freeing the aio slot,
6844 therefore no other thread is allowed to do the freeing!
6845 @param[in] segment The number of the segment in the aio arrays to
6846 wait for; segment 0 is the ibuf I/O thread,
6847 segment 1 the log I/O thread, then follow the
6848 non-ibuf read threads, and as the last are the
6849 non-ibuf write threads; if this is
6850 ULINT_UNDEFINED, then it means that sync AIO
6851 is used, and this parameter is ignored
6852 @param[out] m1 the messages passed with the AIO request; note
6853 that also in the case where the AIO operation
6854 failed, these output parameters are valid and
6855 can be used to restart the operation,
6856 for example
6857 @param[out] m2 callback message
6858 @param[out] type OS_FILE_WRITE or ..._READ
6859 @return DB_SUCCESS or error code */
6860 dberr_t
os_aio_handler(ulint segment,fil_node_t ** m1,void ** m2,IORequest * request)6861 os_aio_handler(
6862 ulint segment,
6863 fil_node_t** m1,
6864 void** m2,
6865 IORequest* request)
6866 {
6867 dberr_t err;
6868
6869 if (srv_use_native_aio) {
6870 srv_set_io_thread_op_info(segment, "native aio handle");
6871
6872 #ifdef WIN_ASYNC_IO
6873
6874 err = os_aio_windows_handler(segment, 0, m1, m2, request);
6875
6876 #elif defined(LINUX_NATIVE_AIO)
6877
6878 err = os_aio_linux_handler(segment, m1, m2, request);
6879
6880 #else
6881 ut_error;
6882
6883 err = DB_ERROR; /* Eliminate compiler warning */
6884
6885 #endif /* WIN_ASYNC_IO */
6886
6887 } else {
6888 srv_set_io_thread_op_info(segment, "simulated aio handle");
6889
6890 err = os_aio_simulated_handler(segment, m1, m2, request);
6891 }
6892
6893 return(err);
6894 }
6895
6896 /** Constructor
6897 @param[in] id The latch ID
6898 @param[in] n Number of AIO slots
6899 @param[in] segments Number of segments */
AIO(latch_id_t id,ulint n,ulint segments)6900 AIO::AIO(
6901 latch_id_t id,
6902 ulint n,
6903 ulint segments)
6904 :
6905 m_slots(n),
6906 m_n_segments(segments),
6907 m_n_reserved()
6908 # ifdef LINUX_NATIVE_AIO
6909 ,m_aio_ctx(),
6910 m_events(m_slots.size())
6911 ,m_pending(NULL)
6912 ,m_count(NULL)
6913 # elif defined(_WIN32)
6914 ,m_handles()
6915 # endif /* LINUX_NATIVE_AIO */
6916 {
6917 ut_a(n > 0);
6918 ut_a(m_n_segments > 0);
6919
6920 mutex_create(id, &m_mutex);
6921
6922 m_not_full = os_event_create("aio_not_full");
6923 m_is_empty = os_event_create("aio_is_empty");
6924
6925 std::uninitialized_fill(m_slots.begin(), m_slots.end(), Slot());
6926 #ifdef LINUX_NATIVE_AIO
6927 memset(&m_events[0], 0x0, sizeof(m_events[0]) * m_events.size());
6928 #endif /* LINUX_NATIVE_AIO */
6929
6930 os_event_set(m_is_empty);
6931 }
6932
6933 /** Initialise the slots */
6934 dberr_t
init_slots()6935 AIO::init_slots()
6936 {
6937 for (ulint i = 0; i < m_slots.size(); ++i) {
6938 Slot& slot = m_slots[i];
6939
6940 slot.pos = static_cast<uint16_t>(i);
6941
6942 slot.is_reserved = false;
6943
6944 #ifdef WIN_ASYNC_IO
6945
6946 slot.handle = CreateEvent(NULL, TRUE, FALSE, NULL);
6947
6948 OVERLAPPED* over = &slot.control;
6949
6950 over->hEvent = slot.handle;
6951
6952 (*m_handles)[i] = over->hEvent;
6953
6954 #elif defined(LINUX_NATIVE_AIO)
6955
6956 slot.ret = 0;
6957
6958 slot.n_bytes = 0;
6959
6960 memset(&slot.control, 0x0, sizeof(slot.control));
6961
6962 #endif /* WIN_ASYNC_IO */
6963 }
6964
6965 return(DB_SUCCESS);
6966 }
6967
#ifdef LINUX_NATIVE_AIO
/** Initialise the Linux Native AIO interface: allocate one io_context
per segment, create each context with room for slots_per_segment()
events, and allocate the zero-filled m_pending (one entry per slot) and
m_count (one entry per segment) arrays.
@return DB_SUCCESS, DB_OUT_OF_MEMORY if an allocation failed, or
DB_IO_ERROR if an io_context could not be created */
dberr_t
AIO::init_linux_native_aio()
{
	/* Initialize the io_context array. One io_context
	per segment in the array. */

	ut_a(m_aio_ctx == NULL);

	m_aio_ctx = static_cast<io_context**>(
		ut_zalloc_nokey(m_n_segments * sizeof(*m_aio_ctx)));

	if (m_aio_ctx == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	const ulint	max_events = slots_per_segment();

	for (ulint i = 0; i < m_n_segments; ++i) {

		if (!linux_create_io_ctx(max_events, &m_aio_ctx[i])) {
			/* If something bad happened during aio setup
			we should call it a day and return right away.
			We don't care about any leaks because a failure
			to initialize the io subsystem means that the
			server (or atleast the innodb storage engine)
			is not going to startup. */
			return(DB_IO_ERROR);
		}
	}

	/* Check these allocations the same way as m_aio_ctx above, so
	that no NULL array is left behind on a successful return. */
	m_pending = static_cast<struct iocb**>(
		ut_zalloc_nokey(m_slots.size() * sizeof(struct iocb*)));

	if (m_pending == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	m_count = static_cast<ulint*>(
		ut_zalloc_nokey(m_n_segments * sizeof(ulint)));

	if (m_count == NULL) {
		return(DB_OUT_OF_MEMORY);
	}

	return(DB_SUCCESS);
}
#endif /* LINUX_NATIVE_AIO */
7009
7010 /** Initialise the array */
7011 dberr_t
init()7012 AIO::init()
7013 {
7014 ut_a(!m_slots.empty());
7015
7016 #ifdef _WIN32
7017 ut_a(m_handles == NULL);
7018
7019 m_handles = UT_NEW_NOKEY(Handles(m_slots.size()));
7020 #endif /* _WIN32 */
7021
7022 if (srv_use_native_aio) {
7023 #ifdef LINUX_NATIVE_AIO
7024 dberr_t err = init_linux_native_aio();
7025
7026 if (err != DB_SUCCESS) {
7027 return(err);
7028 }
7029
7030 #endif /* LINUX_NATIVE_AIO */
7031 }
7032
7033 return(init_slots());
7034 }
7035
7036 /** Creates an aio wait array. Note that we return NULL in case of failure.
7037 We don't care about freeing memory here because we assume that a
7038 failure will result in server refusing to start up.
7039 @param[in] id Latch ID
7040 @param[in] n maximum number of pending AIO operations
7041 allowed; n must be divisible by m_n_segments
7042 @param[in] n_segments number of segments in the AIO array
7043 @return own: AIO array, NULL on failure */
7044 AIO*
create(latch_id_t id,ulint n,ulint n_segments)7045 AIO::create(
7046 latch_id_t id,
7047 ulint n,
7048 ulint n_segments)
7049 {
7050 if ((n % n_segments)) {
7051
7052 ib::error()
7053 << "Maximum number of AIO operations must be "
7054 << "divisible by number of segments";
7055
7056 return(NULL);
7057 }
7058
7059 AIO* array = UT_NEW_NOKEY(AIO(id, n, n_segments));
7060
7061 if (array != NULL && array->init() != DB_SUCCESS) {
7062
7063 UT_DELETE(array);
7064
7065 array = NULL;
7066 }
7067
7068 return(array);
7069 }
7070
7071 /** AIO destructor */
~AIO()7072 AIO::~AIO()
7073 {
7074 #ifdef WIN_ASYNC_IO
7075 for (ulint i = 0; i < m_slots.size(); ++i) {
7076 CloseHandle(m_slots[i].handle);
7077 }
7078 #endif /* WIN_ASYNC_IO */
7079
7080 #ifdef _WIN32
7081 UT_DELETE(m_handles);
7082 #endif /* _WIN32 */
7083
7084 mutex_destroy(&m_mutex);
7085
7086 os_event_destroy(m_not_full);
7087 os_event_destroy(m_is_empty);
7088
7089 #if defined(LINUX_NATIVE_AIO)
7090 if (srv_use_native_aio) {
7091 m_events.clear();
7092 ut_free(m_aio_ctx);
7093 #ifdef UNIV_DEBUG
7094 if (m_pending) {
7095 for (size_t idx = 0; idx < m_slots.size(); ++idx)
7096 ut_ad(m_pending[idx] == NULL);
7097 }
7098 if (m_count) {
7099 for (size_t idx = 0; idx < m_n_segments; ++idx)
7100 ut_ad(m_count[idx] == 0);
7101 }
7102 #endif
7103 ut_free(m_pending);
7104 ut_free(m_count);
7105 }
7106 #endif /* LINUX_NATIVE_AIO */
7107
7108 m_slots.clear();
7109 }
7110
7111 /** Initializes the asynchronous io system. Creates one array each for ibuf
7112 and log i/o. Also creates one array each for read and write where each
7113 array is divided logically into n_readers and n_writers
7114 respectively. The caller must create an i/o handler thread for each
7115 segment in these arrays. This function also creates the sync array.
7116 No i/o handler thread needs to be created for that
7117 @param[in] n_per_seg maximum number of pending aio
7118 operations allowed per segment
7119 @param[in] n_readers number of reader threads
7120 @param[in] n_writers number of writer threads
7121 @param[in] n_slots_sync number of slots in the sync aio array
7122 @return true if the AIO sub-system was started successfully */
7123 bool
start(ulint n_per_seg,ulint n_readers,ulint n_writers,ulint n_slots_sync)7124 AIO::start(
7125 ulint n_per_seg,
7126 ulint n_readers,
7127 ulint n_writers,
7128 ulint n_slots_sync)
7129 {
7130 #if defined(LINUX_NATIVE_AIO)
7131 /* Check if native aio is supported on this system and tmpfs */
7132 if (srv_use_native_aio && !is_linux_native_aio_supported()) {
7133
7134 ib::warn() << "Linux Native AIO disabled.";
7135
7136 srv_use_native_aio = FALSE;
7137 }
7138 #endif /* LINUX_NATIVE_AIO */
7139
7140 srv_reset_io_thread_op_info();
7141
7142 s_reads = create(
7143 LATCH_ID_OS_AIO_READ_MUTEX, n_readers * n_per_seg, n_readers);
7144
7145 if (s_reads == NULL) {
7146 return(false);
7147 }
7148
7149 ulint start = srv_read_only_mode ? 0 : 2;
7150 ulint n_segs = n_readers + start;
7151
7152 /* 0 is the ibuf segment and 1 is the redo log segment. */
7153 for (ulint i = start; i < n_segs; ++i) {
7154 ut_a(i < SRV_MAX_N_IO_THREADS);
7155 srv_io_thread_function[i] = "read thread";
7156 }
7157
7158 ulint n_segments = n_readers;
7159
7160 if (!srv_read_only_mode) {
7161
7162 s_ibuf = create(LATCH_ID_OS_AIO_IBUF_MUTEX, n_per_seg, 1);
7163
7164 if (s_ibuf == NULL) {
7165 return(false);
7166 }
7167
7168 ++n_segments;
7169
7170 srv_io_thread_function[0] = "insert buffer thread";
7171
7172 s_log = create(LATCH_ID_OS_AIO_LOG_MUTEX, n_per_seg, 1);
7173
7174 if (s_log == NULL) {
7175 return(false);
7176 }
7177
7178 ++n_segments;
7179
7180 srv_io_thread_function[1] = "log thread";
7181
7182 } else {
7183 s_ibuf = s_log = NULL;
7184 }
7185
7186 s_writes = create(
7187 LATCH_ID_OS_AIO_WRITE_MUTEX, n_writers * n_per_seg, n_writers);
7188
7189 if (s_writes == NULL) {
7190 return(false);
7191 }
7192
7193 n_segments += n_writers;
7194
7195 for (ulint i = start + n_readers; i < n_segments; ++i) {
7196 ut_a(i < SRV_MAX_N_IO_THREADS);
7197 srv_io_thread_function[i] = "write thread";
7198 }
7199
7200 ut_ad(n_segments >= static_cast<ulint>(srv_read_only_mode ? 2 : 4));
7201
7202 s_sync = create(LATCH_ID_OS_AIO_SYNC_MUTEX, n_slots_sync, 1);
7203
7204 if (s_sync == NULL) {
7205
7206 return(false);
7207 }
7208
7209 os_aio_n_segments = n_segments;
7210
7211 os_aio_validate();
7212
7213 os_aio_segment_wait_events = static_cast<os_event_t*>(
7214 ut_zalloc_nokey(
7215 n_segments * sizeof *os_aio_segment_wait_events));
7216
7217 if (os_aio_segment_wait_events == NULL) {
7218
7219 return(false);
7220 }
7221
7222 for (ulint i = 0; i < n_segments; ++i) {
7223 os_aio_segment_wait_events[i] = os_event_create(0);
7224 }
7225
7226 os_last_printout = ut_time_monotonic();
7227
7228 return(true);
7229 }
7230
7231 /** Free the AIO arrays */
7232 void
shutdown()7233 AIO::shutdown()
7234 {
7235 UT_DELETE(s_ibuf);
7236 s_ibuf = NULL;
7237
7238 UT_DELETE(s_log);
7239 s_log = NULL;
7240
7241 UT_DELETE(s_writes);
7242 s_writes = NULL;
7243
7244 UT_DELETE(s_sync);
7245 s_sync = NULL;
7246
7247 UT_DELETE(s_reads);
7248 s_reads = NULL;
7249 }
7250
#if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)

/** Max disk sector size */
static const ulint	MAX_SECTOR_SIZE = 4096;

/**
Try and get the FusionIO sector size. Only done when one of the O_DIRECT
flush methods is configured: a temporary file is created with O_DIRECT in
the data home directory and written with increasing power-of-two sizes and
alignments, starting from UNIV_SECTOR_SIZE, until a write succeeds. The
result is stored in os_io_ptr_align. errno is reset to 0 before
returning. */
void
os_fusionio_get_sector_size()
{
	const bool	uses_o_direct
		= srv_unix_file_flush_method == SRV_UNIX_O_DIRECT
		|| srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC;

	if (!uses_o_direct) {
		return;
	}

	static const char	probe_suffix[] = "/check_sector_size";

	char	current_dir[3];
	char*	path = srv_data_home;

	/* If the srv_data_home is empty, set the path to
	current dir. */
	if (*path == 0) {
		current_dir[0] = FN_CURLIB;
		current_dir[1] = FN_LIBCHAR;
		current_dir[2] = 0;
		path = current_dir;
	}

	/* Get the path of data file: everything up to the last path
	separator, or the whole string when there is none. */
	const char*	dir_end = strrchr(path, OS_PATH_SEPARATOR);
	const ulint	dir_len = dir_end ? dir_end - path : strlen(path);

	/* Allocate a new, zero-filled path (sizeof includes the
	terminating NUL) and build "<dir>/check_sector_size" in it. */
	char*	check_file_name = static_cast<char*>(
		ut_zalloc_nokey(dir_len + sizeof probe_suffix));

	memcpy(check_file_name, path, dir_len);
	strcat(check_file_name + dir_len, probe_suffix);

	/* Create a tmp file for checking sector size. */
	os_file_t	check_file = ::open(
		check_file_name,
		O_CREAT|O_TRUNC|O_WRONLY|O_DIRECT,
		S_IRWXU);

	if (check_file == -1) {
		ib::error()
			<< "Failed to create check sector file, errno:"
			<< errno << " Please confirm O_DIRECT is"
			<< " supported and remove the file "
			<< check_file_name << " if it exists.";
		ut_free(check_file_name);
		errno = 0;
		return;
	}

	/* Try to write the file with different sector size
	alignment. */
	byte*	ptr = static_cast<byte*>(ut_malloc_nokey(2 * MAX_SECTOR_SIZE));
	ulint	sector_size = UNIV_SECTOR_SIZE;

	for (; sector_size <= MAX_SECTOR_SIZE; sector_size *= 2) {

		byte*	block_ptr = static_cast<byte*>(
			ut_align(ptr, sector_size));

		const ssize_t	ret = pwrite(
			check_file, block_ptr, sector_size, 0);

		if (ret > 0 && (ulint) ret == sector_size) {
			/* A full write went through at this size. */
			break;
		}
	}

	/* The sector size should <= MAX_SECTOR_SIZE. */
	ut_ad(sector_size <= MAX_SECTOR_SIZE);

	close(check_file);
	unlink(check_file_name);

	ut_free(check_file_name);
	ut_free(ptr);
	errno = 0;

	os_io_ptr_align = sector_size;
}
#endif /* !NO_FALLOCATE && UNIV_LINUX */
7342
7343 /** Initializes the asynchronous io system. Creates one array each for ibuf
7344 and log i/o. Also creates one array each for read and write where each
7345 array is divided logically into n_readers and n_writers
7346 respectively. The caller must create an i/o handler thread for each
7347 segment in these arrays. This function also creates the sync array.
7348 No i/o handler thread needs to be created for that
7349 @param[in] n_readers number of reader threads
7350 @param[in] n_writers number of writer threads
7351 @param[in] n_slots_sync number of slots in the sync aio array */
7352 bool
os_aio_init(ulint n_readers,ulint n_writers,ulint n_slots_sync)7353 os_aio_init(
7354 ulint n_readers,
7355 ulint n_writers,
7356 ulint n_slots_sync)
7357 {
7358 /* Maximum number of pending aio operations allowed per segment */
7359 ulint limit = 8 * OS_AIO_N_PENDING_IOS_PER_THREAD;
7360
7361 #ifdef _WIN32
7362 if (srv_use_native_aio) {
7363 limit = SRV_N_PENDING_IOS_PER_THREAD;
7364 }
7365 #endif /* _WIN32 */
7366
7367 ut_a(block_cache == NULL);
7368
7369 block_cache = UT_NEW_NOKEY(Blocks(MAX_BLOCKS));
7370
7371 for (Blocks::iterator it = block_cache->begin();
7372 it != block_cache->end();
7373 ++it) {
7374
7375 ut_a(it->m_in_use == 0);
7376 ut_a(it->m_ptr == NULL);
7377
7378 /* Allocate double of max page size memory, since
7379 compress could generate more bytes than orgininal
7380 data. */
7381 it->m_ptr = static_cast<byte*>(
7382 ut_malloc_nokey(BUFFER_BLOCK_SIZE));
7383
7384 ut_a(it->m_ptr != NULL);
7385 }
7386
7387 /* Get sector size for DIRECT_IO. In this case, we need to
7388 know the sector size for aligning the write buffer. */
7389 #if !defined(NO_FALLOCATE) && defined(UNIV_LINUX)
7390 os_fusionio_get_sector_size();
7391 #endif /* !NO_FALLOCATE && UNIV_LINUX */
7392
7393 return(AIO::start(limit, n_readers, n_writers, n_slots_sync));
7394 }
7395
7396 /** Frees the asynchronous io system. */
7397 void
os_aio_free()7398 os_aio_free()
7399 {
7400 AIO::shutdown();
7401
7402 for (ulint i = 0; i < os_aio_n_segments; i++) {
7403 os_event_destroy(os_aio_segment_wait_events[i]);
7404 }
7405
7406 ut_free(os_aio_segment_wait_events);
7407 os_aio_segment_wait_events = 0;
7408 os_aio_n_segments = 0;
7409
7410 for (Blocks::iterator it = block_cache->begin();
7411 it != block_cache->end();
7412 ++it) {
7413
7414 ut_a(it->m_in_use == 0);
7415 ut_free(it->m_ptr);
7416 }
7417
7418 UT_DELETE(block_cache);
7419
7420 block_cache = NULL;
7421 }
7422
7423 /** Wakes up all async i/o threads so that they know to exit themselves in
7424 shutdown. */
7425 void
os_aio_wake_all_threads_at_shutdown()7426 os_aio_wake_all_threads_at_shutdown()
7427 {
7428 #ifdef WIN_ASYNC_IO
7429
7430 AIO::wake_at_shutdown();
7431
7432 #elif defined(LINUX_NATIVE_AIO)
7433
7434 /* When using native AIO interface the io helper threads
7435 wait on io_getevents with a timeout value of 500ms. At
7436 each wake up these threads check the server status.
7437 No need to do anything to wake them up. */
7438
7439 if (srv_use_native_aio) {
7440 return;
7441 }
7442
7443 #endif /* !WIN_ASYNC_AIO */
7444
7445 /* Fall through to simulated AIO handler wakeup if we are
7446 not using native AIO. */
7447
7448 /* This loop wakes up all simulated ai/o threads */
7449
7450 for (ulint i = 0; i < os_aio_n_segments; ++i) {
7451
7452 os_event_set(os_aio_segment_wait_events[i]);
7453 }
7454 }
7455
7456 /** Waits until there are no pending writes in AIO::s_writes. There can
7457 be other, synchronous, pending writes. */
7458 void
os_aio_wait_until_no_pending_writes()7459 os_aio_wait_until_no_pending_writes()
7460 {
7461 AIO::wait_until_no_pending_writes();
7462 }
7463
7464 /** Calculates segment number for a slot.
7465 @param[in] array AIO wait array
7466 @param[in] slot slot in this array
7467 @return segment number (which is the number used by, for example,
7468 I/O-handler threads) */
7469 ulint
get_segment_no_from_slot(const AIO * array,const Slot * slot)7470 AIO::get_segment_no_from_slot(
7471 const AIO* array,
7472 const Slot* slot)
7473 {
7474 ulint segment;
7475 ulint seg_len;
7476
7477 if (array == s_ibuf) {
7478 ut_ad(!srv_read_only_mode);
7479
7480 segment = IO_IBUF_SEGMENT;
7481
7482 } else if (array == s_log) {
7483 ut_ad(!srv_read_only_mode);
7484
7485 segment = IO_LOG_SEGMENT;
7486
7487 } else if (array == s_reads) {
7488 seg_len = s_reads->slots_per_segment();
7489
7490 segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
7491 } else {
7492 ut_a(array == s_writes);
7493
7494 seg_len = s_writes->slots_per_segment();
7495
7496 segment = s_reads->m_n_segments
7497 + (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len;
7498 }
7499
7500 return(segment);
7501 }
7502
7503 /** Requests for a slot in the aio array. If no slot is available, waits until
7504 not_full-event becomes signaled.
7505
7506 @param[in,out] type IO context
7507 @param[in,out] m1 message to be passed along with the AIO
7508 operation
7509 @param[in,out] m2 message to be passed along with the AIO
7510 operation
7511 @param[in] file file handle
7512 @param[in] name name of the file or path as a NUL-terminated
7513 string
7514 @param[in,out] buf buffer where to read or from which to write
7515 @param[in] offset file offset, where to read from or start writing
7516 @param[in] len length of the block to read or write
7517 @return pointer to slot */
7518 Slot*
reserve_slot(IORequest & type,fil_node_t * m1,void * m2,pfs_os_file_t file,const char * name,void * buf,os_offset_t offset,ulint len,ulint space_id)7519 AIO::reserve_slot(
7520 IORequest& type,
7521 fil_node_t* m1,
7522 void* m2,
7523 pfs_os_file_t file,
7524 const char* name,
7525 void* buf,
7526 os_offset_t offset,
7527 ulint len,
7528 ulint space_id)
7529 {
7530 #ifdef WIN_ASYNC_IO
7531 ut_a((len & 0xFFFFFFFFUL) == len);
7532 #endif /* WIN_ASYNC_IO */
7533
7534 /* No need of a mutex. Only reading constant fields */
7535 ulint slots_per_seg;
7536
7537 ut_ad(type.validate());
7538
7539 slots_per_seg = slots_per_segment();
7540
7541 /* We attempt to keep adjacent blocks in the same local
7542 segment. This can help in merging IO requests when we are
7543 doing simulated AIO */
7544 ulint local_seg;
7545
7546 local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) % m_n_segments;
7547
7548 for (;;) {
7549
7550 acquire();
7551
7552 if (m_n_reserved != m_slots.size()) {
7553 break;
7554 }
7555
7556 release();
7557
7558 if (!srv_use_native_aio) {
7559 /* If the handler threads are suspended,
7560 wake them so that we get more slots */
7561
7562 os_aio_simulated_wake_handler_threads();
7563 }
7564
7565 os_event_wait(m_not_full);
7566 }
7567
7568 ulint counter = 0;
7569 Slot* slot = NULL;
7570
7571 /* We start our search for an available slot from our preferred
7572 local segment and do a full scan of the array. We are
7573 guaranteed to find a slot in full scan. */
7574 for (ulint i = local_seg * slots_per_seg;
7575 counter < m_slots.size();
7576 ++i, ++counter) {
7577
7578 i %= m_slots.size();
7579
7580 slot = at(i);
7581
7582 if (slot->is_reserved == false) {
7583 break;
7584 }
7585 }
7586
7587 /* We MUST always be able to get hold of a reserved slot. */
7588 ut_a(counter < m_slots.size());
7589
7590 ut_a(slot->is_reserved == false);
7591
7592 ++m_n_reserved;
7593
7594 if (m_n_reserved == 1) {
7595 os_event_reset(m_is_empty);
7596 }
7597
7598 if (m_n_reserved == m_slots.size()) {
7599 os_event_reset(m_not_full);
7600 }
7601
7602 slot->is_reserved = true;
7603 slot->reservation_time = ut_time_monotonic();
7604 slot->m1 = m1;
7605 slot->m2 = m2;
7606 slot->file = file;
7607 slot->name = name;
7608 #ifdef _WIN32
7609 slot->len = static_cast<DWORD>(len);
7610 #else
7611 slot->len = static_cast<ulint>(len);
7612 #endif /* _WIN32 */
7613 slot->type = type;
7614 slot->buf = static_cast<byte*>(buf);
7615 slot->ptr = slot->buf;
7616 slot->offset = offset;
7617 slot->err = DB_SUCCESS;
7618 slot->original_len = static_cast<uint32>(len);
7619 slot->io_already_done = false;
7620 slot->space_id = space_id;
7621 slot->buf_block = NULL;
7622 slot->encrypt_log_buf = NULL;
7623
7624 if (srv_use_native_aio
7625 && offset > 0
7626 && type.is_write()
7627 && type.is_compressed()) {
7628 ulint compressed_len = len;
7629
7630 ut_ad(!type.is_log());
7631
7632 release();
7633
7634 void* src_buf = slot->buf;
7635 slot->buf_block = os_file_compress_page(
7636 type,
7637 src_buf,
7638 &compressed_len);
7639
7640 slot->buf = static_cast<byte*>(src_buf);
7641 slot->ptr = slot->buf;
7642 #ifdef _WIN32
7643 slot->len = static_cast<DWORD>(compressed_len);
7644 #else
7645 slot->len = static_cast<ulint>(compressed_len);
7646 #endif /* _WIN32 */
7647 slot->skip_punch_hole = !type.punch_hole();
7648
7649 acquire();
7650 }
7651
7652 /* We do encryption after compression, since if we do encryption
7653 before compression, the encrypted data will cause compression fail
7654 or low compression rate. */
7655 if (srv_use_native_aio
7656 && offset > 0
7657 && type.is_write()
7658 && type.is_encrypted()
7659 && (type.encryption_algorithm().m_type != Encryption::KEYRING ||
7660 (type.encryption_algorithm().m_key != NULL &&
7661 Encryption::can_page_be_keyring_encrypted(slot->buf)))) {
7662
7663 ulint encrypted_len = slot->len;
7664 Block* encrypted_block;
7665 byte* encrypt_log_buf;
7666
7667 release();
7668
7669 void* src_buf = slot->buf;
7670 if (!type.is_log()) {
7671 encrypted_block = os_file_encrypt_page(
7672 type,
7673 src_buf,
7674 &encrypted_len);
7675
7676 if (slot->buf_block != NULL) {
7677 os_free_block(slot->buf_block);
7678 }
7679
7680 slot->buf_block = encrypted_block;
7681 } else {
7682 /* Skip encrypted log file header */
7683 if (offset >= LOG_FILE_HDR_SIZE) {
7684 encrypted_block = os_file_encrypt_log(
7685 type,
7686 src_buf,
7687 encrypt_log_buf,
7688 &encrypted_len);
7689
7690 if (slot->buf_block != NULL) {
7691 os_free_block(slot->buf_block);
7692 }
7693
7694 slot->buf_block = encrypted_block;
7695
7696 if (slot->encrypt_log_buf != NULL) {
7697 ut_free(slot->encrypt_log_buf);
7698 }
7699
7700 slot->encrypt_log_buf = encrypt_log_buf;
7701 }
7702 }
7703
7704 slot->buf = static_cast<byte*>(src_buf);
7705 slot->ptr = slot->buf;
7706
7707 #ifdef _WIN32
7708 slot->len = static_cast<DWORD>(encrypted_len);
7709 #else
7710 slot->len = static_cast<ulint>(encrypted_len);
7711 #endif /* _WIN32 */
7712
7713 acquire();
7714 }
7715
7716 #ifdef WIN_ASYNC_IO
7717 {
7718 OVERLAPPED* control;
7719
7720 control = &slot->control;
7721 control->Offset = (DWORD) offset & 0xFFFFFFFF;
7722 control->OffsetHigh = (DWORD) (offset >> 32);
7723
7724 ResetEvent(slot->handle);
7725 }
7726 #elif defined(LINUX_NATIVE_AIO)
7727
7728 /* If we are not using native AIO skip this part. */
7729 if (srv_use_native_aio) {
7730
7731 off_t aio_offset;
7732
7733 /* Check if we are dealing with 64 bit arch.
7734 If not then make sure that offset fits in 32 bits. */
7735 aio_offset = (off_t) offset;
7736
7737 ut_a(sizeof(aio_offset) >= sizeof(offset)
7738 || ((os_offset_t) aio_offset) == offset);
7739
7740 struct iocb* iocb = &slot->control;
7741
7742 if (type.is_read()) {
7743 io_prep_pread(
7744 iocb, file.m_file, slot->ptr, slot->len, aio_offset);
7745 } else {
7746 ut_ad(type.is_write());
7747 io_prep_pwrite(
7748 iocb, file.m_file, slot->ptr, slot->len, aio_offset);
7749 }
7750
7751 iocb->data = slot;
7752
7753 slot->n_bytes = 0;
7754 slot->ret = 0;
7755 }
7756 #endif /* LINUX_NATIVE_AIO */
7757
7758 release();
7759
7760 return(slot);
7761 }
7762
7763 /** Wakes up a simulated aio i/o-handler thread if it has something to do.
7764 @param[in] global_segment The number of the segment in the AIO arrays */
7765 void
wake_simulated_handler_thread(ulint global_segment)7766 AIO::wake_simulated_handler_thread(ulint global_segment)
7767 {
7768 ut_ad(!srv_use_native_aio);
7769
7770 AIO* array;
7771 ulint segment = get_array_and_local_segment(&array, global_segment);
7772
7773 array->wake_simulated_handler_thread(global_segment, segment);
7774 }
7775
7776 /** Wakes up a simulated AIO I/O-handler thread if it has something to do
7777 for a local segment in the AIO array.
7778 @param[in] global_segment The number of the segment in the AIO arrays
7779 @param[in] segment The local segment in the AIO array */
7780 void
wake_simulated_handler_thread(ulint global_segment,ulint segment)7781 AIO::wake_simulated_handler_thread(ulint global_segment, ulint segment)
7782 {
7783 ut_ad(!srv_use_native_aio);
7784
7785 ulint n = slots_per_segment();
7786 ulint offset = segment * n;
7787
7788 /* Look through n slots after the segment * n'th slot */
7789
7790 acquire();
7791
7792 const Slot* slot = at(offset);
7793
7794 for (ulint i = 0; i < n; ++i, ++slot) {
7795
7796 if (slot->is_reserved) {
7797
7798 /* Found an i/o request */
7799
7800 release();
7801
7802 os_event_t event;
7803
7804 event = os_aio_segment_wait_events[global_segment];
7805
7806 os_event_set(event);
7807
7808 return;
7809 }
7810 }
7811
7812 release();
7813 }
7814
7815 /** Wakes up simulated aio i/o-handler threads if they have something to do. */
7816 void
os_aio_simulated_wake_handler_threads()7817 os_aio_simulated_wake_handler_threads()
7818 {
7819 if (srv_use_native_aio) {
7820 /* We do not use simulated aio: do nothing */
7821
7822 return;
7823 }
7824
7825 os_aio_recommend_sleep_for_read_threads = false;
7826
7827 for (ulint i = 0; i < os_aio_n_segments; i++) {
7828 AIO::wake_simulated_handler_thread(i);
7829 }
7830 }
7831
7832 /** Select the IO slot array
7833 @param[in] type Type of IO, READ or WRITE
7834 @param[in] read_only true if running in read-only mode
7835 @param[in] mode IO mode
7836 @return slot array or NULL if invalid mode specified */
7837 AIO*
select_slot_array(IORequest & type,bool read_only,ulint mode)7838 AIO::select_slot_array(IORequest& type, bool read_only, ulint mode)
7839 {
7840 AIO* array;
7841
7842 ut_ad(type.validate());
7843
7844 switch (mode) {
7845 case OS_AIO_NORMAL:
7846
7847 array = type.is_read() ? AIO::s_reads : AIO::s_writes;
7848 break;
7849
7850 case OS_AIO_IBUF:
7851 ut_ad(type.is_read());
7852
7853 /* Reduce probability of deadlock bugs in connection with ibuf:
7854 do not let the ibuf i/o handler sleep */
7855
7856 type.clear_do_not_wake();
7857
7858 array = read_only ? AIO::s_reads : AIO::s_ibuf;
7859 break;
7860
7861 case OS_AIO_LOG:
7862
7863 array = read_only ? AIO::s_reads : AIO::s_log;
7864 break;
7865
7866 case OS_AIO_SYNC:
7867
7868 array = AIO::s_sync;
7869 #if defined(LINUX_NATIVE_AIO)
7870 /* In Linux native AIO we don't use sync IO array. */
7871 ut_a(!srv_use_native_aio);
7872 #endif /* LINUX_NATIVE_AIO */
7873 break;
7874
7875 default:
7876 ut_error;
7877 array = NULL; /* Eliminate compiler warning */
7878 }
7879
7880 return(array);
7881 }
7882
7883 #ifdef WIN_ASYNC_IO
7884 /** This function is only used in Windows asynchronous i/o.
7885 Waits for an aio operation to complete. This function is used to wait the
7886 for completed requests. The aio array of pending requests is divided
7887 into segments. The thread specifies which segment or slot it wants to wait
7888 for. NOTE: this function will also take care of freeing the aio slot,
7889 therefore no other thread is allowed to do the freeing!
7890 @param[in] segment The number of the segment in the aio arrays to
7891 wait for; segment 0 is the ibuf I/O thread,
7892 segment 1 the log I/O thread, then follow the
7893 non-ibuf read threads, and as the last are the
7894 non-ibuf write threads; if this is
7895 ULINT_UNDEFINED, then it means that sync AIO
7896 is used, and this parameter is ignored
7897 @param[in] pos this parameter is used only in sync AIO:
7898 wait for the aio slot at this position
7899 @param[out] m1 the messages passed with the AIO request; note
7900 that also in the case where the AIO operation
7901 failed, these output parameters are valid and
7902 can be used to restart the operation,
7903 for example
7904 @param[out] m2 callback message
7905 @param[out] type OS_FILE_WRITE or ..._READ
7906 @return DB_SUCCESS or error code */
7907 static
7908 dberr_t
os_aio_windows_handler(ulint segment,ulint pos,fil_node_t ** m1,void ** m2,IORequest * type)7909 os_aio_windows_handler(
7910 ulint segment,
7911 ulint pos,
7912 fil_node_t** m1,
7913 void** m2,
7914 IORequest* type)
7915 {
7916 Slot* slot;
7917 dberr_t err;
7918 AIO* array;
7919 ulint orig_seg = segment;
7920
7921 if (segment == ULINT_UNDEFINED) {
7922 segment = 0;
7923 array = AIO::sync_array();
7924 } else {
7925 segment = AIO::get_array_and_local_segment(&array, segment);
7926 }
7927
7928 /* NOTE! We only access constant fields in os_aio_array. Therefore
7929 we do not have to acquire the protecting mutex yet */
7930
7931 ut_ad(os_aio_validate_skip());
7932
7933 if (array == AIO::sync_array()) {
7934
7935 WaitForSingleObject(array->at(pos)->handle, INFINITE);
7936
7937 } else {
7938 if (orig_seg != ULINT_UNDEFINED) {
7939 srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
7940 }
7941
7942 pos = WaitForMultipleObjects(
7943 (DWORD) array->slots_per_segment(),
7944 array->handles(segment),
7945 FALSE, INFINITE);
7946 }
7947
7948 array->acquire();
7949
7950 if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS
7951 && array->is_empty()
7952 && !buf_page_cleaner_is_active) {
7953
7954 *m1 = NULL;
7955 *m2 = NULL;
7956
7957 array->release();
7958
7959 return(DB_SUCCESS);
7960 }
7961
7962 ulint n = array->slots_per_segment();
7963
7964 ut_a(pos >= WAIT_OBJECT_0 && pos <= WAIT_OBJECT_0 + n);
7965
7966 slot = array->at(pos + segment * n);
7967
7968 ut_a(slot->is_reserved);
7969
7970 if (orig_seg != ULINT_UNDEFINED) {
7971 srv_set_io_thread_op_info(
7972 orig_seg, "get windows aio return value");
7973 }
7974
7975 BOOL ret;
7976 ret = GetOverlappedResult(
7977 slot->file.m_file, &slot->control, &slot->n_bytes, TRUE);
7978 *m1 = slot->m1;
7979 *m2 = slot->m2;
7980
7981 *type = slot->type;
7982
7983 BOOL retry = FALSE;
7984
7985 if (ret && slot->n_bytes == slot->len) {
7986
7987 err = DB_SUCCESS;
7988
7989 } else if (os_file_handle_error(slot->name, "Windows aio")) {
7990
7991 retry = true;
7992
7993 } else {
7994
7995 err = DB_IO_ERROR;
7996 }
7997
7998 array->release();
7999
8000 if (retry) {
8001 /* Retry failed read/write operation synchronously.
8002 No need to hold array->m_mutex. */
8003
8004 #ifdef UNIV_PFS_IO
8005 /* This read/write does not go through os_file_read
8006 and os_file_write APIs, need to register with
8007 performance schema explicitly here. */
8008 struct PSI_file_locker* locker = NULL;
8009 PSI_file_locker_state state;
8010 register_pfs_file_io_begin(
8011 &state, locker, slot->file, slot->len,
8012 slot->type.is_write()
8013 ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__);
8014 #endif /* UNIV_PFS_IO */
8015
8016 ut_a((slot->len & 0xFFFFFFFFUL) == slot->len);
8017
8018 ssize_t n_bytes = SyncFileIO::execute(slot);
8019
8020 #ifdef UNIV_PFS_IO
8021 register_pfs_file_io_end(locker, slot->len);
8022 #endif /* UNIV_PFS_IO */
8023
8024 if (n_bytes < 0 && GetLastError() == ERROR_IO_PENDING) {
8025 /* AIO was queued successfully!
8026 We want a synchronous I/O operation on a
8027 file where we also use async I/O: in Windows
8028 we must use the same wait mechanism as for
8029 async I/O */
8030
8031 BOOL ret;
8032 ret = GetOverlappedResult(
8033 slot->file.m_file, &slot->control, &slot->n_bytes,
8034 TRUE);
8035 n_bytes = ret ? slot->n_bytes : -1;
8036 }
8037
8038 err = (n_bytes == slot->len) ? DB_SUCCESS : DB_IO_ERROR;
8039 }
8040
8041 if (err == DB_SUCCESS) {
8042 err = AIOHandler::post_io_processing(slot);
8043 }
8044
8045 array->release_with_mutex(slot);
8046
8047 return(err);
8048 }
8049 #endif /* WIN_ASYNC_IO */
8050
/**
NOTE! Use the corresponding macro os_aio(), not directly this function!
Requests an asynchronous i/o operation. When mode is OS_AIO_SYNC (and, on
Windows async I/O builds, native AIO is disabled) the request is executed
synchronously via os_file_read_func() / os_file_write_func() instead.
@param[in,out]	type		IO request context
@param[in]	mode		IO mode
@param[in]	name		Name of the file or path as NUL terminated
				string
@param[in]	file		Open file handle
@param[in,out]	buf		buffer to read into (read requests) or to
				write from (write requests)
@param[in]	offset		file offset of the read or write
@param[in]	n		number of bytes to read or write
@param[in]	read_only	if true read only mode checks are enforced
@param[in,out]	m1		Message for the AIO handler, (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in,out]	m2		message for the AIO handler (can be used to
				identify a completed AIO operation); ignored
				if mode is OS_AIO_SYNC
@param[in]	space_id	passed on to AIO::reserve_slot() together
				with the rest of the request
@param[in,out]	trx		passed to os_file_read_func() on the
				synchronous path and to
				trx_stats::bump_io_read() for queued reads
@param[in]	should_buffer	Whether to buffer an aio request.
				AIO read ahead uses this. If you plan to
				use this parameter, make sure you remember to
				call os_aio_dispatch_read_array_submit()
				when you're ready to commit all your
				requests.

@return DB_SUCCESS or error code */
dberr_t
os_aio_func(
	IORequest&	type,
	ulint		mode,
	const char*	name,
	pfs_os_file_t	file,
	void*		buf,
	os_offset_t	offset,
	ulint		n,
	bool		read_only,
	fil_node_t*	m1,
	void*		m2,
	ulint		space_id,
	trx_t*		trx,
	bool		should_buffer)
{
#ifdef WIN_ASYNC_IO
	BOOL		ret = TRUE;
#endif /* WIN_ASYNC_IO */

	ut_ad(n > 0);
	ut_ad((n % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad((offset % OS_FILE_LOG_BLOCK_SIZE) == 0);
	ut_ad(os_aio_validate_skip());

#ifdef WIN_ASYNC_IO
	/* The length must fit in 32 bits on this code path. */
	ut_ad((n & 0xFFFFFFFFUL) == n);
#endif /* WIN_ASYNC_IO */

	if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
	    && !srv_use_native_aio
#endif /* WIN_ASYNC_IO */
	    ) {
		/* This is actually an ordinary synchronous read or write:
		no need to use an i/o-handler thread. NOTE that if we use
		Windows async i/o, Windows does not allow us to use
		ordinary synchronous os_file_read etc. on the same file,
		therefore we have built a special mechanism for synchronous
		wait in the Windows case.
		Also note that the Performance Schema instrumentation has
		been performed by current os_aio_func()'s wrapper function
		pfs_os_aio_func(). So we would no longer need to call
		Performance Schema instrumented os_file_read() and
		os_file_write(). Instead, we should use os_file_read_func()
		and os_file_write_func() */

		if (type.is_read()) {
			return(os_file_read_func(type, file.m_file, buf,
						 offset, n, trx));
		}

		ut_ad(type.is_write());
		return(os_file_write_func(type, name, file.m_file, buf, offset, n));
	}

	/* Re-entered from err_exit when os_file_handle_error() asks for a
	retry; a new slot is reserved on every pass. */
try_again:

	AIO*	array;

	array = AIO::select_slot_array(type, read_only, mode);

	Slot*	slot;

	slot = array->reserve_slot(type, m1, m2, file, name, buf, offset, n,
				   space_id);

	if (type.is_read()) {
		trx_stats::bump_io_read(trx, n);

		if (srv_use_native_aio) {

			++os_n_file_reads;

			os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
			ret = ReadFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			if (!array->linux_dispatch(slot, should_buffer)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */
		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that owns the reserved slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else if (type.is_write()) {

		if (srv_use_native_aio) {
			++os_n_file_writes;

#ifdef WIN_ASYNC_IO
			ret = WriteFile(
				file.m_file, slot->ptr, slot->len,
				&slot->n_bytes, &slot->control);
#elif defined(LINUX_NATIVE_AIO)
			/* Writes are always dispatched with buffering
			disabled (second argument false). */
			if (!array->linux_dispatch(slot, false)) {
				goto err_exit;
			}
#endif /* WIN_ASYNC_IO */

		} else if (type.is_wake()) {
			/* Simulated AIO: wake the handler thread of the
			segment that owns the reserved slot. */
			AIO::wake_simulated_handler_thread(
				AIO::get_segment_no_from_slot(array, slot));
		}
	} else {
		/* The request is neither a read nor a write. */
		ut_error;
	}

#ifdef WIN_ASYNC_IO
	if (srv_use_native_aio) {
		/* Success is either an immediate full-length transfer or a
		pending overlapped operation. */
		if ((ret && slot->len == slot->n_bytes)
		     || (!ret && GetLastError() == ERROR_IO_PENDING)) {
			/* aio was queued successfully! */

			if (mode == OS_AIO_SYNC) {
				IORequest	dummy_type;
				void*		dummy_mess2;
				struct fil_node_t* dummy_mess1;

				/* We want a synchronous i/o operation on a
				file where we also use async i/o: in Windows
				we must use the same wait mechanism as for
				async i/o */

				return(os_aio_windows_handler(
					ULINT_UNDEFINED, slot->pos,
					&dummy_mess1, &dummy_mess2,
					&dummy_type));
			}

			return(DB_SUCCESS);
		}

		goto err_exit;
	}
#endif /* WIN_ASYNC_IO */

	/* AIO request was queued successfully! */
	return(DB_SUCCESS);

#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO
err_exit:
#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */

	/* Give the reserved slot back before deciding whether to retry. */
	array->release_with_mutex(slot);

	if (os_file_handle_error(
		name, type.is_read() ? "aio read" : "aio write")) {

		goto try_again;
	}

	return(DB_IO_ERROR);
}
8234
8235 /** Simulated AIO handler for reaping IO requests */
8236 class SimulatedAIOHandler {
8237
8238 public:
8239
8240 /** Constructor
8241 @param[in,out] array The AIO array
8242 @param[in] segment Local segment in the array */
SimulatedAIOHandler(AIO * array,ulint segment)8243 SimulatedAIOHandler(AIO* array, ulint segment)
8244 :
8245 m_oldest(),
8246 m_n_elems(),
8247 m_lowest_offset(IB_UINT64_MAX),
8248 m_array(array),
8249 m_n_slots(),
8250 m_segment(segment),
8251 m_ptr(),
8252 m_buf()
8253 {
8254 ut_ad(m_segment < 100);
8255
8256 m_slots.resize(OS_AIO_MERGE_N_CONSECUTIVE);
8257 }
8258
8259 /** Destructor */
~SimulatedAIOHandler()8260 ~SimulatedAIOHandler()
8261 {
8262 if (m_ptr != NULL) {
8263 ut_free(m_ptr);
8264 }
8265 }
8266
8267 /** Reset the state of the handler
8268 @param[in] n_slots Number of pending AIO operations supported */
init(ulint n_slots)8269 void init(ulint n_slots)
8270 {
8271 m_oldest = 0;
8272 m_n_elems = 0;
8273 m_n_slots = n_slots;
8274 m_lowest_offset = IB_UINT64_MAX;
8275
8276 if (m_ptr != NULL) {
8277 ut_free(m_ptr);
8278 m_ptr = m_buf = NULL;
8279 }
8280
8281 m_slots[0] = NULL;
8282 }
8283
8284 /** Check if there is a slot for which the i/o has already been done
8285 @param[out] n_reserved Number of reserved slots
8286 @return the first completed slot that is found. */
check_completed(ulint * n_reserved)8287 Slot* check_completed(ulint* n_reserved)
8288 {
8289 ulint offset = m_segment * m_n_slots;
8290
8291 *n_reserved = 0;
8292
8293 Slot* slot;
8294
8295 slot = m_array->at(offset);
8296
8297 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8298
8299 if (slot->is_reserved) {
8300
8301 if (slot->io_already_done) {
8302
8303 ut_a(slot->is_reserved);
8304
8305 return(slot);
8306 }
8307
8308 ++*n_reserved;
8309 }
8310 }
8311
8312 return(NULL);
8313 }
8314
8315 /** If there are at least 2 seconds old requests, then pick the
8316 oldest one to prevent starvation. If several requests have the
8317 same age, then pick the one at the lowest offset.
8318 @return true if request was selected */
select()8319 bool select()
8320 {
8321 if (!select_oldest()) {
8322
8323 return(select_lowest_offset());
8324 }
8325
8326 return(true);
8327 }
8328
8329 /** Check if there are several consecutive blocks
8330 to read or write. Merge them if found. */
merge()8331 void merge()
8332 {
8333 /* if m_n_elems != 0, then we have assigned
8334 something valid to consecutive_ios[0] */
8335 ut_ad(m_n_elems != 0);
8336 ut_ad(first_slot() != NULL);
8337
8338 Slot* slot = first_slot();
8339
8340 while (!merge_adjacent(slot)) {
8341 /* No op */
8342 }
8343 }
8344
8345 /** We have now collected n_consecutive I/O requests
8346 in the array; allocate a single buffer which can hold
8347 all data, and perform the I/O
8348 @return the length of the buffer */
allocate_buffer()8349 ulint allocate_buffer()
8350 MY_ATTRIBUTE((warn_unused_result))
8351 {
8352 ulint len;
8353 Slot* slot = first_slot();
8354
8355 ut_ad(m_ptr == NULL);
8356
8357 if (slot->type.is_read() && m_n_elems > 1) {
8358
8359 len = 0;
8360
8361 for (ulint i = 0; i < m_n_elems; ++i) {
8362 len += m_slots[i]->len;
8363 }
8364
8365 m_ptr = static_cast<byte*>(
8366 ut_malloc_nokey(len + UNIV_PAGE_SIZE));
8367
8368 m_buf = static_cast<byte*>(
8369 ut_align(m_ptr, UNIV_PAGE_SIZE));
8370
8371 } else {
8372 len = first_slot()->len;
8373 m_buf = first_slot()->buf;
8374 }
8375
8376 return(len);
8377 }
8378
8379 /** We have to compress the individual pages and punch
8380 holes in them on a page by page basis when writing to
8381 tables that can be compresed at the IO level.
8382 @param[in] len Value returned by allocate_buffer */
copy_to_buffer(ulint len)8383 void copy_to_buffer(ulint len)
8384 {
8385 Slot* slot = first_slot();
8386
8387 if (len > slot->len && slot->type.is_write()) {
8388
8389 byte* ptr = m_buf;
8390
8391 ut_ad(ptr != slot->buf);
8392
8393 /* Copy the buffers to the combined buffer */
8394 for (ulint i = 0; i < m_n_elems; ++i) {
8395
8396 slot = m_slots[i];
8397
8398 memmove(ptr, slot->buf, slot->len);
8399
8400 ptr += slot->len;
8401 }
8402 }
8403 }
8404
8405 /** Do the I/O with ordinary, synchronous i/o functions:
8406 @param[in] len Length of buffer for IO */
io()8407 void io()
8408 {
8409 if (first_slot()->type.is_write()) {
8410
8411 for (ulint i = 0; i < m_n_elems; ++i) {
8412 write(m_slots[i]);
8413 }
8414
8415 } else {
8416
8417 for (ulint i = 0; i < m_n_elems; ++i) {
8418 read(m_slots[i]);
8419 }
8420 }
8421 }
8422
8423 /** Do the decompression of the pages read in */
io_complete()8424 void io_complete()
8425 {
8426 // Note: For non-compressed tables. Not required
8427 // for correctness.
8428 }
8429
8430 /** Mark the i/os done in slots */
done()8431 void done()
8432 {
8433 for (ulint i = 0; i < m_n_elems; ++i) {
8434 m_slots[i]->io_already_done = true;
8435 }
8436 }
8437
8438 /** @return the first slot in the consecutive array */
first_slot()8439 Slot* first_slot()
8440 MY_ATTRIBUTE((warn_unused_result))
8441 {
8442 ut_a(m_n_elems > 0);
8443
8444 return(m_slots[0]);
8445 }
8446
8447 /** Wait for I/O requests
8448 @param[in] global_segment The global segment
8449 @param[in,out] event Wait on event if no active requests
8450 @return the number of slots */
8451 ulint check_pending(
8452 ulint global_segment,
8453 os_event_t event)
8454 MY_ATTRIBUTE((warn_unused_result));
8455 private:
8456
8457 /** Do the file read
8458 @param[in,out] slot Slot that has the IO context */
read(Slot * slot)8459 void read(Slot* slot)
8460 {
8461 dberr_t err = os_file_read_func(
8462 slot->type,
8463 slot->file.m_file,
8464 slot->ptr,
8465 slot->offset,
8466 slot->len, NULL);
8467 ut_a(err == DB_SUCCESS);
8468 }
8469
8470 /** Do the file read
8471 @param[in,out] slot Slot that has the IO context */
write(Slot * slot)8472 void write(Slot* slot)
8473 {
8474 dberr_t err = os_file_write_func(
8475 slot->type,
8476 slot->name,
8477 slot->file.m_file,
8478 slot->ptr,
8479 slot->offset,
8480 slot->len);
8481 ut_a(err == DB_SUCCESS || err == DB_IO_NO_PUNCH_HOLE);
8482 }
8483
8484 /** @return true if the slots are adjacent and can be merged */
adjacent(const Slot * s1,const Slot * s2) const8485 bool adjacent(const Slot* s1, const Slot* s2) const
8486 {
8487 return(s1 != s2
8488 && s1->file.m_file == s2->file.m_file
8489 && s2->offset == s1->offset + s1->len
8490 && s1->type == s2->type);
8491 }
8492
8493 /** @return true if merge limit reached or no adjacent slots found. */
merge_adjacent(Slot * & current)8494 bool merge_adjacent(Slot*& current)
8495 {
8496 Slot* slot;
8497 ulint offset = m_segment * m_n_slots;
8498
8499 slot = m_array->at(offset);
8500
8501 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8502
8503 if (slot->is_reserved && adjacent(current, slot)) {
8504
8505 current = slot;
8506
8507 /* Found a consecutive i/o request */
8508
8509 m_slots[m_n_elems] = slot;
8510
8511 ++m_n_elems;
8512
8513 return(m_n_elems >= m_slots.capacity());
8514 }
8515 }
8516
8517 return(true);
8518 }
8519
8520 /** There were no old requests. Look for an I/O request at the lowest
8521 offset in the array (we ignore the high 32 bits of the offset in these
8522 heuristics) */
select_lowest_offset()8523 bool select_lowest_offset()
8524 {
8525 ut_ad(m_n_elems == 0);
8526
8527 ulint offset = m_segment * m_n_slots;
8528
8529 m_lowest_offset = IB_UINT64_MAX;
8530
8531 for (ulint i = 0; i < m_n_slots; ++i) {
8532 Slot* slot;
8533
8534 slot = m_array->at(i + offset);
8535
8536 if (slot->is_reserved
8537 && slot->offset < m_lowest_offset) {
8538
8539 /* Found an i/o request */
8540 m_slots[0] = slot;
8541
8542 m_n_elems = 1;
8543
8544 m_lowest_offset = slot->offset;
8545 }
8546 }
8547
8548 return(m_n_elems > 0);
8549 }
8550
8551 /** Select the slot if it is older than the current oldest slot.
8552 @param[in] slot The slot to check */
select_if_older(Slot * slot)8553 void select_if_older(Slot* slot)
8554 {
8555 int64_t time_diff = ut_time_monotonic() -
8556 slot->reservation_time;
8557
8558 const uint64_t age = time_diff > 0 ? (uint64_t) time_diff : 0;
8559
8560 if ((age >= 2 && age > m_oldest)
8561 || (age >= 2
8562 && age == m_oldest
8563 && slot->offset < m_lowest_offset)) {
8564
8565 /* Found an i/o request */
8566 m_slots[0] = slot;
8567
8568 m_n_elems = 1;
8569
8570 m_oldest = age;
8571
8572 m_lowest_offset = slot->offset;
8573 }
8574 }
8575
8576 /** Select th oldest slot in the array
8577 @return true if oldest slot found */
select_oldest()8578 bool select_oldest()
8579 {
8580 ut_ad(m_n_elems == 0);
8581
8582 Slot* slot;
8583 ulint offset = m_n_slots * m_segment;
8584
8585 slot = m_array->at(offset);
8586
8587 for (ulint i = 0; i < m_n_slots; ++i, ++slot) {
8588
8589 if (slot->is_reserved) {
8590 select_if_older(slot);
8591 }
8592 }
8593
8594 return(m_n_elems > 0);
8595 }
8596
8597 typedef std::vector<Slot*> slots_t;
8598
8599 private:
8600 ulint m_oldest;
8601 ulint m_n_elems;
8602 os_offset_t m_lowest_offset;
8603
8604 AIO* m_array;
8605 ulint m_n_slots;
8606 ulint m_segment;
8607
8608 slots_t m_slots;
8609
8610 byte* m_ptr;
8611 byte* m_buf;
8612 };
8613
8614 /** Wait for I/O requests
8615 @return the number of slots */
8616 ulint
check_pending(ulint global_segment,os_event_t event)8617 SimulatedAIOHandler::check_pending(
8618 ulint global_segment,
8619 os_event_t event)
8620 {
8621 /* NOTE! We only access constant fields in os_aio_array.
8622 Therefore we do not have to acquire the protecting mutex yet */
8623
8624 ut_ad(os_aio_validate_skip());
8625
8626 ut_ad(m_segment < m_array->get_n_segments());
8627
8628 /* Look through n slots after the segment * n'th slot */
8629
8630 if (AIO::is_read(m_array)
8631 && os_aio_recommend_sleep_for_read_threads) {
8632
8633 /* Give other threads chance to add several
8634 I/Os to the array at once. */
8635
8636 srv_set_io_thread_op_info(
8637 global_segment, "waiting for i/o request");
8638
8639 os_event_wait(event);
8640
8641 return(0);
8642 }
8643
8644 return(m_array->slots_per_segment());
8645 }
8646
/** Does simulated AIO. This function should be called by an i/o-handler
thread. It loops until it either finds a slot whose I/O has already been
done, selects and performs pending I/O itself, or detects that the server
is shutting down with nothing left to do. The returned slot is released
before the function returns.

@param[in]	global_segment	The number of the segment in the aio arrays
			to wait for; segment 0 is the ibuf i/o thread,
			segment 1 the log i/o thread, then follow the non-ibuf
			read threads, and as the last are the non-ibuf write
			threads
@param[out]	m1	the messages passed with the AIO request; note that
			also in the case where the AIO operation failed, these
			output parameters are valid and can be used to restart
			the operation, for example; set to NULL on the
			shutdown exit path
@param[out]	m2	Callback argument; set to NULL on the shutdown exit
			path
@param[out]	type	IO context copied from the slot; not written on the
			shutdown exit path
@return DB_SUCCESS (no other value is returned by this function) */
static
dberr_t
os_aio_simulated_handler(
	ulint		global_segment,
	fil_node_t**	m1,
	void**		m2,
	IORequest*	type)
{
	Slot*		slot;
	AIO*		array;
	ulint		segment;
	os_event_t	event = os_aio_segment_wait_events[global_segment];

	segment = AIO::get_array_and_local_segment(&array, global_segment);

	SimulatedAIOHandler	handler(array, segment);

	for (;;) {

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (a)");

		/* Returns 0 after having waited on the event; in that case
		start over. */
		ulint	n_slots = handler.check_pending(global_segment, event);

		if (n_slots == 0) {
			continue;
		}

		handler.init(n_slots);

		srv_set_io_thread_op_info(
			global_segment, "looking for i/o requests (b)");

		array->acquire();

		ulint	n_reserved;

		slot = handler.check_completed(&n_reserved);

		if (slot != NULL) {

			/* A request in this segment has already had its
			I/O done; leave the loop holding the array mutex. */
			break;

		} else if (n_reserved == 0
			   && !buf_page_cleaner_is_active
			   && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {

			/* There is no completed request. If there
			are no pending request at all, and the system
			is being shut down, exit. */

			array->release();

			*m1 = NULL;

			*m2 = NULL;

			return(DB_SUCCESS);

		} else if (handler.select()) {

			/* A pending request was selected; leave the loop
			holding the array mutex, with slot == NULL. */
			break;
		}

		/* No I/O requested at the moment */

		srv_set_io_thread_op_info(
			global_segment, "resetting wait event");

		/* We wait here until there are more IO requests
		for this segment. The event is reset while the array
		mutex is still held. */

		os_event_reset(event);

		array->release();

		srv_set_io_thread_op_info(
			global_segment, "waiting for i/o request");

		os_event_wait(event);
	}

	/* slot is non-NULL only if check_completed() found a request whose
	I/O was already done. Otherwise handler.select() chose a request
	and the I/O still has to be performed here. */

	if (slot == NULL) {
		/* Merge adjacent requests */
		handler.merge();

		/* Check if there are several consecutive blocks
		to read or write */

		srv_set_io_thread_op_info(
			global_segment, "consecutive i/o requests");

		// Note: We don't support write combining for simulated AIO.
		//ulint	total_len = handler.allocate_buffer();

		/* We release the array mutex for the time of the I/O: NOTE that
		this assumes that there is just one i/o-handler thread serving
		a single segment of slots! */

		array->release();

		// Note: We don't support write combining for simulated AIO.
		//handler.copy_to_buffer(total_len);

		srv_set_io_thread_op_info(global_segment, "doing file i/o");

		handler.io();

		srv_set_io_thread_op_info(global_segment, "file i/o done");

		handler.io_complete();

		array->acquire();

		/* Flag every slot that was handled as io_already_done. */
		handler.done();

		/* We return the messages for the first slot now, and if there
		were several slots, the messages will be returned with
		subsequent calls of this function */

		slot = handler.first_slot();
	}

	ut_ad(slot->is_reserved);

	*m1 = slot->m1;
	*m2 = slot->m2;

	*type = slot->type;

	/* Release the slot first (the array mutex is held at this point),
	then the array mutex itself. */
	array->release(slot);

	array->release();

	return(DB_SUCCESS);
}
8799
8800 /** Get the total number of pending IOs
8801 @return the total number of pending IOs */
8802 ulint
total_pending_io_count()8803 AIO::total_pending_io_count()
8804 {
8805 ulint count = s_reads->pending_io_count();
8806
8807 if (s_writes != NULL) {
8808 count += s_writes->pending_io_count();
8809 }
8810
8811 if (s_ibuf != NULL) {
8812 count += s_ibuf->pending_io_count();
8813 }
8814
8815 if (s_log != NULL) {
8816 count += s_log->pending_io_count();
8817 }
8818
8819 if (s_sync != NULL) {
8820 count += s_sync->pending_io_count();
8821 }
8822
8823 return(count);
8824 }
8825
8826 /** Validates the consistency the aio system.
8827 @return true if ok */
8828 static
8829 bool
os_aio_validate()8830 os_aio_validate()
8831 {
8832 /* The methods countds and validates, we ignore the count. */
8833 AIO::total_pending_io_count();
8834
8835 return(true);
8836 }
8837
8838 /** Prints pending IO requests per segment of an aio array.
8839 We probably don't need per segment statistics but they can help us
8840 during development phase to see if the IO requests are being
8841 distributed as expected.
8842 @param[in,out] file File where to print
8843 @param[in] segments Pending IO array */
8844 void
print_segment_info(FILE * file,const ulint * segments)8845 AIO::print_segment_info(
8846 FILE* file,
8847 const ulint* segments)
8848 {
8849 ut_ad(m_n_segments > 0);
8850
8851 if (m_n_segments > 1) {
8852
8853 fprintf(file, " [");
8854
8855 for (ulint i = 0; i < m_n_segments; ++i, ++segments) {
8856
8857 if (i != 0) {
8858 fprintf(file, ", ");
8859 }
8860
8861 fprintf(file, ULINTPF, *segments);
8862 }
8863
8864 fprintf(file, "] ");
8865 }
8866 }
8867
8868 /** Prints info about the aio array.
8869 @param[in,out] file Where to print */
8870 void
print(FILE * file)8871 AIO::print(FILE* file)
8872 {
8873 ulint count = 0;
8874 ulint n_res_seg[SRV_MAX_N_IO_THREADS];
8875
8876 mutex_enter(&m_mutex);
8877
8878 ut_a(!m_slots.empty());
8879 ut_a(m_n_segments > 0);
8880
8881 memset(n_res_seg, 0x0, sizeof(n_res_seg));
8882
8883 for (ulint i = 0; i < m_slots.size(); ++i) {
8884 Slot& slot = m_slots[i];
8885 ulint segment = (i * m_n_segments) / m_slots.size();
8886
8887 if (slot.is_reserved) {
8888
8889 ++count;
8890
8891 ++n_res_seg[segment];
8892
8893 ut_a(slot.len > 0);
8894 }
8895 }
8896
8897 ut_a(m_n_reserved == count);
8898
8899 print_segment_info(file, n_res_seg);
8900
8901 mutex_exit(&m_mutex);
8902 }
8903
8904 /** Print all the AIO segments
8905 @param[in,out] file Where to print */
8906 void
print_all(FILE * file)8907 AIO::print_all(FILE* file)
8908 {
8909 s_reads->print(file);
8910
8911 if (s_writes != NULL) {
8912 fputs(", aio writes:", file);
8913 s_writes->print(file);
8914 }
8915
8916 if (s_ibuf != NULL) {
8917 fputs(",\n ibuf aio reads:", file);
8918 s_ibuf->print(file);
8919 }
8920
8921 if (s_log != NULL) {
8922 fputs(", log i/o's:", file);
8923 s_log->print(file);
8924 }
8925
8926 if (s_sync != NULL) {
8927 fputs(", sync i/o's:", file);
8928 s_sync->print(file);
8929 }
8930 }
8931
8932 /** Prints info of the aio arrays.
8933 @param[in,out] file file where to print */
8934 void
os_aio_print(FILE * file)8935 os_aio_print(FILE* file)
8936 {
8937 ib_time_monotonic_t current_time;
8938 double time_elapsed;
8939 double avg_bytes_read;
8940
8941 for (ulint i = 0; i < srv_n_file_io_threads; ++i) {
8942 fprintf(file, "I/O thread %lu state: %s (%s)",
8943 (ulong) i,
8944 srv_io_thread_op_info[i],
8945 srv_io_thread_function[i]);
8946
8947 #ifndef _WIN32
8948 if (os_event_is_set(os_aio_segment_wait_events[i])) {
8949 fprintf(file, " ev set");
8950 }
8951 #endif /* _WIN32 */
8952
8953 fprintf(file, "\n");
8954 }
8955
8956 fputs("Pending normal aio reads:", file);
8957
8958 AIO::print_all(file);
8959
8960 putc('\n', file);
8961 current_time = ut_time_monotonic();
8962 time_elapsed = 0.001 + (current_time - os_last_printout);
8963
8964 fprintf(file,
8965 "Pending flushes (fsync) log: " ULINTPF "; "
8966 "buffer pool: " ULINTPF "\n"
8967 ULINTPF " OS file reads, "
8968 ULINTPF " OS file writes, "
8969 ULINTPF " OS fsyncs\n",
8970 fil_n_pending_log_flushes,
8971 fil_n_pending_tablespace_flushes,
8972 os_n_file_reads,
8973 os_n_file_writes,
8974 os_n_fsyncs);
8975
8976 if (os_n_pending_writes != 0 || os_n_pending_reads != 0) {
8977 fprintf(file,
8978 ULINTPF " pending preads, "
8979 ULINTPF " pending pwrites\n",
8980 os_n_pending_reads,
8981 os_n_pending_writes);
8982 }
8983
8984 if (os_n_file_reads == os_n_file_reads_old) {
8985 avg_bytes_read = 0.0;
8986 } else {
8987 avg_bytes_read = (double) os_bytes_read_since_printout
8988 / (os_n_file_reads - os_n_file_reads_old);
8989 }
8990
8991 fprintf(file,
8992 "%.2f reads/s, %lu avg bytes/read,"
8993 " %.2f writes/s, %.2f fsyncs/s\n",
8994 (os_n_file_reads - os_n_file_reads_old)
8995 / time_elapsed,
8996 (ulong) avg_bytes_read,
8997 (os_n_file_writes - os_n_file_writes_old)
8998 / time_elapsed,
8999 (os_n_fsyncs - os_n_fsyncs_old)
9000 / time_elapsed);
9001
9002 os_n_file_reads_old = os_n_file_reads;
9003 os_n_file_writes_old = os_n_file_writes;
9004 os_n_fsyncs_old = os_n_fsyncs;
9005 os_bytes_read_since_printout = 0;
9006
9007 os_last_printout = current_time;
9008 }
9009
9010 /** Refreshes the statistics used to print per-second averages. */
9011 void
os_aio_refresh_stats()9012 os_aio_refresh_stats()
9013 {
9014 os_n_fsyncs_old = os_n_fsyncs;
9015
9016 os_bytes_read_since_printout = 0;
9017
9018 os_n_file_reads_old = os_n_file_reads;
9019
9020 os_n_file_writes_old = os_n_file_writes;
9021
9022 os_n_fsyncs_old = os_n_fsyncs;
9023
9024 os_bytes_read_since_printout = 0;
9025
9026 os_last_printout = ut_time_monotonic();
9027 }
9028
/** Checks that all slots in the system have been freed, that is, there are
no pending io operations in any of the AIO arrays.
@return true if all free, i.e. AIO::total_pending_io_count() is zero */
bool
os_aio_all_slots_free()
{
	return(AIO::total_pending_io_count() == 0);
}
9037
9038 #ifdef UNIV_DEBUG
9039 /** Prints all pending IO for the array
9040 @param[in] file file where to print
9041 @param[in] array array to process */
9042 void
to_file(FILE * file) const9043 AIO::to_file(FILE* file) const
9044 {
9045 acquire();
9046
9047 fprintf(file, " %lu\n", static_cast<ulong>(m_n_reserved));
9048
9049 for (ulint i = 0; i < m_slots.size(); ++i) {
9050
9051 const Slot& slot = m_slots[i];
9052
9053 if (slot.is_reserved) {
9054
9055 fprintf(file,
9056 "%s IO for %s (offset=" UINT64PF
9057 ", size=%lu)\n",
9058 slot.type.is_read() ? "read" : "write",
9059 slot.name, slot.offset, slot.len);
9060 }
9061 }
9062
9063 release();
9064 }
9065
9066 /** Print pending IOs for all arrays */
9067 void
print_to_file(FILE * file)9068 AIO::print_to_file(FILE* file)
9069 {
9070 fprintf(file, "Pending normal aio reads:");
9071
9072 s_reads->to_file(file);
9073
9074 if (s_writes != NULL) {
9075 fprintf(file, "Pending normal aio writes:");
9076 s_writes->to_file(file);
9077 }
9078
9079 if (s_ibuf != NULL) {
9080 fprintf(file, "Pending ibuf aio reads:");
9081 s_ibuf->to_file(file);
9082 }
9083
9084 if (s_log != NULL) {
9085 fprintf(file, "Pending log i/o's:");
9086 s_log->to_file(file);
9087 }
9088
9089 if (s_sync != NULL) {
9090 fprintf(file, "Pending sync i/o's:");
9091 s_sync->to_file(file);
9092 }
9093 }
9094
/** Prints all pending IO. Free-function entry point that forwards to
AIO::print_to_file(); compiled only when UNIV_DEBUG is defined.
@param[in]	file	File where to print */
void
os_aio_print_pending_io(
	FILE*	file)
{
	AIO::print_to_file(file);
}
9103
9104 #endif /* UNIV_DEBUG */
9105
/**
Set the file create umask. The value is stored in the file-level variable
os_innodb_umask; this function does nothing else with it.
@param[in]	umask	The umask to use for file creation. */
void
os_file_set_umask(ulint umask)
{
	os_innodb_umask = umask;
}
9114 #else
9115
9116 #include "univ.i"
9117 #include "db0err.h"
9118 #include "mach0data.h"
9119 #include "fsp0fsp.h"
9120 #include "fil0fil.h"
9121 #include "os0file.h"
9122
9123 #ifdef UNIV_NONINL
9124 #include "os0file.ic"
9125 #endif
9126
9127 #include <lz4.h>
9128 #include <zlib.h>
9129
9130 #include <my_aes.h>
9131 #include <my_rnd.h>
9132 #include <mysqld.h>
9133 #include <mysql/service_mysql_keyring.h>
9134
/** In this (UNIV_INNOCHECKSUM) build a block is a plain byte buffer. */
typedef byte	Block;

/** Allocate a page for sync IO. The buffer is UNIV_PAGE_SIZE_MAX * 2 bytes
and comes straight from malloc(); the result is returned without a NULL
check, so callers receive NULL if the allocation fails.
NOTE(review): os_free_block() releases the buffer with ut_free() - confirm
that ut_free() is compatible with malloc() in this build.
@return pointer to page */
static
Block*
os_alloc_block()
{
	return(reinterpret_cast<byte*>(malloc(UNIV_PAGE_SIZE_MAX * 2)));
}
9145
/** Free a page after sync IO. The block is handed to ut_free().
@param[in,own]	block	The block to free/release, as returned by
			os_alloc_block() */
static
void
os_free_block(Block* block)
{
	ut_free(block);
}
9154
9155 #endif /* !UNIV_INNOCHECKSUM */
9156
/** Minimum length needed for encryption: two AES blocks plus the
FIL_PAGE_DATA bytes of page header. */
const unsigned int MIN_ENCRYPTION_LEN = 2 * MY_AES_BLOCK_SIZE + FIL_PAGE_DATA;
9159
9160 /**
9161 @param[in] type The compression type
9162 @return the string representation */
9163 const char*
to_string(Type type)9164 Compression::to_string(Type type)
9165 {
9166 switch(type) {
9167 case NONE:
9168 return("None");
9169 case ZLIB:
9170 return("Zlib");
9171 case LZ4:
9172 return("LZ4");
9173 }
9174
9175 ut_ad(0);
9176
9177 return("<UNKNOWN>");
9178 }
9179
9180 /**
9181 @param[in] meta Page Meta data
9182 @return the string representation */
to_string(const Compression::meta_t & meta)9183 std::string Compression::to_string(const Compression::meta_t& meta)
9184 {
9185 std::ostringstream stream;
9186
9187 stream << "version: " << int(meta.m_version) << " "
9188 << "algorithm: " << meta.m_algorithm << " "
9189 << "(" << to_string(meta.m_algorithm) << ") "
9190 << "orginal_type: " << meta.m_original_type << " "
9191 << "original_size: " << meta.m_original_size << " "
9192 << "compressed_size: " << meta.m_compressed_size;
9193
9194 return(stream.str());
9195 }
9196
/** Check the two byte page type stored at FIL_PAGE_TYPE.
@param[in]	page	Pointer to the start of the page
@return true if it is a compressed page (type FIL_PAGE_COMPRESSED) */
bool
Compression::is_compressed_page(const byte* page)
{
	return(mach_read_from_2(page + FIL_PAGE_TYPE) == FIL_PAGE_COMPRESSED);
}
9203
/** Check the two byte page type stored at FIL_PAGE_TYPE.
@param[in]	page	Pointer to the start of the page
@return true if the page type is FIL_PAGE_COMPRESSED_AND_ENCRYPTED */
bool
Compression::is_compressed_encrypted_page(const byte *page) {
	return (mach_read_from_2(page + FIL_PAGE_TYPE) ==
		FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
}
9209
/** Check a compressed page header version.
@param[in]	version	Version byte read from the page header
@return true if version is FIL_PAGE_VERSION_1 or FIL_PAGE_VERSION_2 */
bool
Compression::is_valid_page_version(uint8_t version) {
	return (version == FIL_PAGE_VERSION_1 || version == FIL_PAGE_VERSION_2);
}
9214
9215 /** Deserizlise the page header compression meta-data
9216 @param[in] page Pointer to the page header
9217 @param[out] control Deserialised data */
9218 void
deserialize_header(const byte * page,Compression::meta_t * control)9219 Compression::deserialize_header(
9220 const byte* page,
9221 Compression::meta_t* control)
9222 {
9223 ut_ad(is_compressed_page(page) || is_compressed_encrypted_page(page));
9224
9225 control->m_version = static_cast<uint8_t>(
9226 mach_read_from_1(page + FIL_PAGE_VERSION));
9227
9228 control->m_original_type = static_cast<uint16_t>(
9229 mach_read_from_2(page + FIL_PAGE_ORIGINAL_TYPE_V1));
9230
9231 control->m_compressed_size = static_cast<uint16_t>(
9232 mach_read_from_2(page + FIL_PAGE_COMPRESS_SIZE_V1));
9233
9234 control->m_original_size = static_cast<uint16_t>(
9235 mach_read_from_2(page + FIL_PAGE_ORIGINAL_SIZE_V1));
9236
9237 control->m_algorithm = static_cast<Type>(
9238 mach_read_from_1(page + FIL_PAGE_ALGORITHM_V1));
9239 }
9240
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
On success the decompressed payload replaces the data that follows the page
header in src and the page type in the header is restored to the original
type recorded in the compression meta-data.
@param[in]	dblwr_recover	true if double write recovery in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression, or NULL
				to have one allocated (and freed) here
@param[in]	dst_len		Size of the scratch area in bytes
@return DB_SUCCESS, DB_CORRUPTION if the header fails validation,
DB_IO_DECOMPRESS_FAIL if the decompressor reports an error, or
DB_UNSUPPORTED for an unknown algorithm */
dberr_t
Compression::deserialize(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	if (!is_compressed_page(src)) {
		/* There is nothing we can do. */
		return(DB_SUCCESS);
	}

	meta_t	header;

	deserialize_header(src, &header);

	/* The compressed payload starts right after the page header. */
	byte*	ptr = src + FIL_PAGE_DATA;

	ut_ad(is_valid_page_version(header.m_version));

	/* Reject headers whose version is unknown, whose original size is
	outside the range a page payload can have, or whose payload would
	not fit in dst_len bytes. Note that dst_len is checked even when
	dst is NULL. */
	if (!is_valid_page_version(header.m_version)
	    || header.m_original_size < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8)
	    || header.m_original_size > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA
	    || dst_len < header.m_original_size + FIL_PAGE_DATA) {

		/* The last check could potentially return DB_OVERFLOW,
		the caller should be able to retry with a larger buffer. */

		return(DB_CORRUPTION);
	}

	/* Non-NULL only when the scratch area is allocated here; it must
	then be released on every exit path below. */
	Block*	block;

	/* The caller doesn't know what to expect */
	if (dst == NULL) {

		block = os_alloc_block();

#ifdef UNIV_INNOCHECKSUM
		dst = block;
#else
		dst = block->m_ptr;
#endif /* UNIV_INNOCHECKSUM */

	} else {
		block = NULL;
	}

	int		ret;
	Compression	compression;
	/* Number of bytes copied back into src; the zlib branch replaces
	it with the length reported by uncompress(). */
	ulint		len = header.m_original_size;

	compression.m_type = static_cast<Compression::Type>(header.m_algorithm);

	switch(compression.m_type) {
	case Compression::ZLIB: {

		/* In: capacity of dst that may be used; out: bytes written. */
		uLongf	zlen = header.m_original_size;

		if (uncompress(dst, &zlen, ptr, header.m_compressed_size)
		    != Z_OK) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		len = static_cast<ulint>(zlen);

		break;
	}

	case Compression::LZ4:

		/* A negative result signals a decompression error; the
		byte count of a successful call is not used, len stays at
		the original size from the header. */
		ret = LZ4_decompress_safe(
			reinterpret_cast<char*>(ptr),
			reinterpret_cast<char*>(dst),
			header.m_compressed_size,
			header.m_original_size);
		if (ret < 0) {

			if (block != NULL) {
				os_free_block(block);
			}

			return(DB_IO_DECOMPRESS_FAIL);
		}

		break;

	default:
#if !defined(UNIV_INNOCHECKSUM)
		ib::error()
			<< "Compression algorithm support missing: "
			<< Compression::to_string(compression.m_type);
#else
		fprintf(stderr, "Compression algorithm support missing: %s\n",
			Compression::to_string(compression.m_type));
#endif /* !UNIV_INNOCHECKSUM */

		if (block != NULL) {
			os_free_block(block);
		}

		return(DB_UNSUPPORTED);
	}

	/* Leave the header alone */
	memmove(src + FIL_PAGE_DATA, dst, len);

	/* Restore the page type the page had before it was compressed. */
	mach_write_to_2(src + FIL_PAGE_TYPE, header.m_original_type);

	/* Debug check: outside double write recovery, the 4 bytes at
	FIL_PAGE_LSN + 4 in the header must equal the 4 bytes stored in the
	trailer of the restored page. */
	ut_ad(dblwr_recover
	      || memcmp(src + FIL_PAGE_LSN + 4,
			src + (header.m_original_size + FIL_PAGE_DATA)
			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0);

	if (block != NULL) {
		os_free_block(block);
	}

	return(DB_SUCCESS);
}
9374
/** Decompress the page data contents. Page type must be FIL_PAGE_COMPRESSED, if
not then the source contents are left unchanged and DB_SUCCESS is returned.
This is a free-function wrapper that forwards all of its arguments to
Compression::deserialize().
@param[in]	dblwr_recover	true if double write recovery in progress
@param[in,out]	src		Data read from disk, decompressed data will be
				copied to this page
@param[in,out]	dst		Scratch area to use for decompression
@param[in]	dst_len		Size of the scratch area in bytes
@return DB_SUCCESS or error code */
dberr_t
os_file_decompress_page(
	bool		dblwr_recover,
	byte*		src,
	byte*		dst,
	ulint		dst_len)
{
	return(Compression::deserialize(dblwr_recover, src, dst, dst_len));
}
9392
9393 /**
9394 @param[in] type The encryption type
9395 @return the string representation */
9396 const char*
to_string(Type type)9397 Encryption::to_string(Type type)
9398 {
9399 switch(type) {
9400 case NONE:
9401 return("N");
9402 case AES:
9403 return("Y");
9404 case KEYRING:
9405 return("KEYRING");
9406 }
9407
9408 ut_ad(0);
9409
9410 return("<UNKNOWN>");
9411 }
9412
/** Generate random encryption value for key and iv. Exactly
ENCRYPTION_KEY_LEN bytes are written, so the buffer must be at least that
large.
@param[in,out]	value	Encryption value; must not be NULL */
void Encryption::random_value(byte* value)
{
	ut_ad(value != NULL);

	my_rand_buffer(value, ENCRYPTION_KEY_LEN);
}
9421
/** Build the keyring name of a tablespace key without a version suffix:
"<ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX>-<key_id>". The body is compiled
out under UNIV_INNOCHECKSUM, leaving key_name untouched.
@param[out]	key_name	Buffer of at least
				ENCRYPTION_MASTER_KEY_NAME_MAX_LEN bytes
@param[in]	key_id		Key id to embed in the name */
void
Encryption::fill_key_name(char *key_name, uint key_id)
{
#ifndef UNIV_INNOCHECKSUM
	/* Clear the whole buffer before formatting the name into it. */
	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);

	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
		    "%s-%u", ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX,
		    key_id);
#endif
}
9433
/** Build the keyring name of a specific version of a tablespace key:
"<ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX>-<key_id>:<key_version>". The body
is compiled out under UNIV_INNOCHECKSUM, leaving key_name untouched.
@param[out]	key_name	Buffer of at least
				ENCRYPTION_MASTER_KEY_NAME_MAX_LEN bytes
@param[in]	key_id		Key id to embed in the name
@param[in]	key_version	Key version to embed in the name */
void
Encryption::fill_key_name(char* key_name, uint key_id, uint key_version)
{
#ifndef UNIV_INNOCHECKSUM
	/* Clear the whole buffer before formatting the name into it. */
	memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);

	ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
		    "%s-%u:%u", ENCRYPTION_PERCONA_SYSTEM_KEY_PREFIX,
		    key_id, key_version);
#endif
}
9445
9446 void
create_tablespace_key(byte ** tablespace_key,uint key_id)9447 Encryption::create_tablespace_key(byte** tablespace_key,
9448 uint key_id)
9449 {
9450 #ifndef UNIV_INNOCHECKSUM
9451 char* key_type = NULL;
9452 size_t key_len;
9453 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9454 int ret;
9455
9456
9457 fill_key_name(key_name, key_id);
9458
9459 /* We call key ring API to generate tablespace key here. */
9460 ret = my_key_generate(key_name, "AES",
9461 NULL, ENCRYPTION_KEY_LEN);
9462
9463 if (ret) {
9464 ib::error() << "Encryption can't generate tablespace key : " << key_name;
9465 *tablespace_key = NULL;
9466 return;
9467 }
9468
9469 byte *system_tablespace_key = NULL;
9470 /* We call key ring API to get tablespace key here. */
9471 ret = my_key_fetch(key_name, &key_type, NULL,
9472 reinterpret_cast<void**>(&system_tablespace_key),
9473 &key_len);
9474
9475 if (ret || system_tablespace_key == NULL) {
9476 ib::error() << "Encryption can't find tablespace key " << key_name << " please check"
9477 " that the keyring plugin is loaded.";
9478 *tablespace_key = NULL;
9479 my_free(key_type);
9480 return;
9481 }
9482 my_free(key_type);
9483
9484 uint tablespace_key_version = 0;
9485 size_t tablespace_key_data_length = 0;
9486
9487 if (parse_system_key(system_tablespace_key, key_len, &tablespace_key_version,
9488 tablespace_key, &tablespace_key_data_length) == NULL) {
9489 my_free(system_tablespace_key);
9490 return;
9491 }
9492 my_free(system_tablespace_key);
9493 // Newly created key should have 1 assigned as its key version
9494 ut_ad(tablespace_key_version == 1 && tablespace_key_data_length == ENCRYPTION_KEY_LEN);
9495 #endif
9496 }
9497
9498
9499 /** Create new master key for key rotation.
9500 @param[in,out] master_key master key */
9501 void
create_master_key(byte ** master_key)9502 Encryption::create_master_key(byte** master_key)
9503 {
9504 #ifndef UNIV_INNOCHECKSUM
9505 char* key_type = NULL;
9506 size_t key_len;
9507 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9508 int ret;
9509
9510 /* If uuid does not match with current server uuid,
9511 set uuid as current server uuid. */
9512 if (strcmp(uuid, server_uuid) != 0) {
9513 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
9514 }
9515 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9516
9517 /* Generate new master key */
9518 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9519 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9520 uuid, master_key_id + 1);
9521
9522 /* We call key ring API to generate master key here. */
9523 ret = my_key_generate(key_name, "AES",
9524 NULL, ENCRYPTION_KEY_LEN);
9525
9526 /* We call key ring API to get master key here. */
9527 ret = my_key_fetch(key_name, &key_type, NULL,
9528 reinterpret_cast<void**>(master_key),
9529 &key_len);
9530
9531 if (ret || *master_key == NULL) {
9532 ib::error() << "Encryption can't find master key, please check"
9533 " the keyring plugin is loaded.";
9534 *master_key = NULL;
9535 } else {
9536 master_key_id++;
9537 }
9538
9539 if (key_type) {
9540 my_free(key_type);
9541 }
9542 #endif
9543 }
9544
9545 void
get_keyring_key(const char * key_name,byte ** key,size_t * key_len)9546 Encryption::get_keyring_key(const char *key_name,
9547 byte** key, size_t *key_len)
9548 {
9549 #ifndef UNIV_INNOCHECKSUM
9550 int ret;
9551 char* key_type = NULL;
9552 //size_t key_len;
9553 /* We call key ring API to get master key here. */
9554 ret = my_key_fetch(key_name, &key_type, NULL,
9555 reinterpret_cast<void**>(key), key_len);
9556
9557 if (key_type) {
9558 my_free(key_type);
9559 }
9560
9561 if (ret) {
9562 *key = NULL;
9563 }
9564 #endif
9565 }
9566
9567 bool
get_tablespace_key(uint key_id,uint tablespace_key_version,byte ** tablespace_key,size_t * key_len)9568 Encryption::get_tablespace_key(uint key_id,
9569 uint tablespace_key_version,
9570 byte** tablespace_key,
9571 size_t *key_len)
9572 {
9573 bool result = true;
9574 #ifndef UNIV_INNOCHECKSUM
9575 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9576
9577 fill_key_name(key_name, key_id, tablespace_key_version);
9578
9579 Encryption::get_keyring_key(key_name, tablespace_key, key_len);
9580
9581 if (*tablespace_key == NULL) {
9582 ib::error() << "Encryption can't find tablespace key, please check"
9583 " the keyring plugin is loaded.";
9584 result = false;
9585 }
9586
9587 #ifdef UNIV_ENCRYPT_DEBUG
9588 if (*tablespace_key) {
9589 fprintf(stderr, "Fetched tablespace key:%s ", key_name);
9590 ut_print_buf(stderr, *tablespace_key, *key_len);
9591 fprintf(stderr, "\n");
9592 }
9593 #endif /* DEBUG_TDE */
9594 #endif
9595 return result;
9596 }
9597
9598 void
get_latest_system_key(const char * system_key_name,byte ** key,uint * key_version,size_t * key_length)9599 Encryption::get_latest_system_key(const char *system_key_name,
9600 byte **key,
9601 uint *key_version,
9602 size_t *key_length)
9603 {
9604 #ifndef UNIV_INNOCHECKSUM
9605 size_t system_key_len = 0;
9606 uchar *system_key = NULL;
9607 get_keyring_key(system_key_name, &system_key, &system_key_len);
9608 if (system_key == NULL)
9609 {
9610 *key = NULL;
9611 return;
9612 }
9613
9614 parse_system_key(system_key, system_key_len, key_version, (uchar**)key, key_length);
9615 my_free(system_key);
9616 #endif
9617 }
9618
/** Fetch the latest version of a tablespace key. The key is looked up
under its unversioned name (see fill_key_name()) through
get_latest_system_key(). The body is compiled out under UNIV_INNOCHECKSUM,
in which case neither output parameter is written.
@param[in]	key_id			Key id
@param[out]	tablespace_key_version	Version of the fetched key
@param[out]	tablespace_key		Fetched key data */
// tablespace_key_version as output parameter
void
Encryption::get_latest_tablespace_key(uint key_id,
				      uint *tablespace_key_version,
				      byte** tablespace_key)
{
#ifndef UNIV_INNOCHECKSUM
	/* Only consumed by the debug printout below. */
	size_t	key_len;
	char	key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];

	fill_key_name(key_name, key_id);

	get_latest_system_key(key_name, tablespace_key, tablespace_key_version, &key_len);

#ifdef UNIV_ENCRYPT_DEBUG
	if (*tablespace_key) {
		fprintf(stderr, "Fetched tablespace key:%s ", key_name);
		ut_print_buf(stderr, *tablespace_key, key_len);
		fprintf(stderr, "\n");
	}
#endif /* DEBUG_TDE */

#endif
}
9643
tablespace_key_exists(uint key_id)9644 bool Encryption::tablespace_key_exists(uint key_id)
9645 {
9646 uint tablespace_key_version;
9647 byte *tablespace_key;
9648
9649 get_latest_tablespace_key(key_id, &tablespace_key_version, &tablespace_key);
9650
9651 if(tablespace_key == NULL)
9652 return false;
9653
9654 my_free(tablespace_key);
9655 return true;
9656 }
9657
tablespace_key_exists_or_create_new_one_if_does_not_exist(uint key_id)9658 bool Encryption::tablespace_key_exists_or_create_new_one_if_does_not_exist(uint key_id)
9659 {
9660 uint tablespace_key_version;
9661 byte *tablespace_key;
9662
9663 get_latest_tablespace_key_or_create_new_one(key_id, &tablespace_key_version, &tablespace_key);
9664
9665 if (tablespace_key == NULL)
9666 return false;
9667
9668 my_free(tablespace_key);
9669 return true;
9670 }
9671
/** Fetch the latest version of a tablespace key; if no key is returned,
create a new one with create_tablespace_key() and report its version as 1.
NOTE(review): the version is set to 1 even if the creation fails, so
callers need to check *tablespace_key for NULL before trusting it.
@param[in]	key_id			Key id
@param[out]	tablespace_key_version	Version of the returned key
@param[out]	tablespace_key		Key data, NULL if the key could
					neither be fetched nor created */
void
Encryption::get_latest_tablespace_key_or_create_new_one(uint key_id,
							 uint *tablespace_key_version,
							 byte** tablespace_key)
{
	get_latest_tablespace_key(key_id, tablespace_key_version, tablespace_key);
	if (*tablespace_key == NULL) {
		Encryption::create_tablespace_key(tablespace_key, key_id);
		*tablespace_key_version = 1;
	}
}
9683
/** Probe the keyring by fetching, or creating if absent, the tablespace
key with id 0 (the default encryption key).
@return true if that key exists or could be created */
bool Encryption::is_keyring_alive()
{
	return Encryption::tablespace_key_exists_or_create_new_one_if_does_not_exist(0); //DEFAULT ENCRYPTION KEY
}
9688
can_page_be_keyring_encrypted(ulint page_type)9689 bool Encryption::can_page_be_keyring_encrypted(ulint page_type)
9690 {
9691 switch (page_type) {
9692 case FIL_PAGE_TYPE_FSP_HDR:
9693 case FIL_PAGE_TYPE_XDES:
9694 case FIL_PAGE_RTREE:
9695 /* File space header, extent descriptor or spatial index
9696 are not encrypted. */
9697 return false;
9698 }
9699 return true;
9700 }
9701
/** Tell whether the given page is subject to keyring encryption, based on
the two byte page type read from its header at FIL_PAGE_TYPE.
@param[in]	page	Pointer to the start of the page; must not be NULL
@return see can_page_be_keyring_encrypted(ulint) */
bool Encryption::can_page_be_keyring_encrypted(byte* page)
{
	ut_ad(page != NULL);
	return can_page_be_keyring_encrypted(mach_read_from_2(page+FIL_PAGE_TYPE));
}
9707
9708
encryption_get_latest_version(uint key_id)9709 uint Encryption::encryption_get_latest_version(uint key_id)
9710 {
9711 #ifndef UNIV_INNOCHECKSUM
9712 uint tablespace_key_version;
9713 byte *tablespace_key;
9714
9715 get_latest_tablespace_key(key_id, &tablespace_key_version, &tablespace_key);
9716
9717 if(tablespace_key == NULL)
9718 return ENCRYPTION_KEY_VERSION_INVALID;
9719
9720 my_free(tablespace_key);
9721 return tablespace_key_version;
9722 #endif
9723 return ENCRYPTION_KEY_VERSION_INVALID;
9724 }
9725
9726 /** Get master key by key id.
9727 @param[in] master_key_id master key id
9728 @param[in] srv_uuid uuid of server instance
9729 @param[in,out] master_key master key */
9730 void
get_master_key(ulint master_key_id,char * srv_uuid,byte ** master_key)9731 Encryption::get_master_key(ulint master_key_id,
9732 char* srv_uuid,
9733 byte** master_key)
9734 {
9735 #ifndef UNIV_INNOCHECKSUM
9736 size_t key_len;
9737 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9738
9739 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9740
9741 if (srv_uuid != NULL) {
9742 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9743 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9744 srv_uuid, master_key_id);
9745 } else {
9746 /* For compitable with 5.7.11, we need to get master key with
9747 server id. */
9748 memset(key_name, 0, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9749 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9750 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9751 server_id, master_key_id);
9752 }
9753
9754 /* We call key ring API to get master key here. */
9755 get_keyring_key(key_name, master_key, &key_len);
9756 if (*master_key == NULL) {
9757 ib::error() << "Encryption can't find master key, please check"
9758 " the keyring plugin is loaded.";
9759 }
9760
9761 #ifdef UNIV_ENCRYPT_DEBUG
9762 if (*master_key) {
9763 fprintf(stderr, "Fetched master key:%lu ", master_key_id);
9764 ut_print_buf(stderr, *master_key, key_len);
9765 fprintf(stderr, "\n");
9766 }
9767 #endif /* DEBUG_TDE */
9768
9769 #endif
9770 }
9771
/** Current master key id. Zero until a master key has been generated or
assigned; see Encryption::get_master_key(). */
ulint	Encryption::master_key_id = 0;

/** Current uuid of server instance. Holds ENCRYPTION_SERVER_UUID_LEN
characters plus a terminating NUL; zero-initialised. */
char	Encryption::uuid[ENCRYPTION_SERVER_UUID_LEN + 1] = {0};
9777
9778 /** Get current master key and master key id
9779 @param[in,out] master_key_id master key id
9780 @param[in,out] master_key master key
9781 @param[in,out] version encryption information version */
9782 void
get_master_key(ulint * master_key_id,byte ** master_key,Encryption::Version * version)9783 Encryption::get_master_key(ulint* master_key_id,
9784 byte** master_key,
9785 Encryption::Version* version)
9786 {
9787 #ifndef UNIV_INNOCHECKSUM
9788 char* key_type = NULL;
9789 size_t key_len;
9790 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
9791 int ret;
9792
9793 memset(key_name, 0, ENCRYPTION_KEY_LEN);
9794 *version = Encryption::ENCRYPTION_VERSION_3;
9795
9796 DBUG_EXECUTE_IF("force_v2_encryption",{
9797 *version = Encryption::ENCRYPTION_VERSION_2;
9798 });
9799
9800
9801 if (Encryption::master_key_id == 0) {
9802 /* If m_master_key is 0, means there's no encrypted
9803 tablespace, we need to generate the first master key,
9804 and store it to key ring. */
9805 memset(uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
9806 memcpy(uuid, server_uuid, ENCRYPTION_SERVER_UUID_LEN);
9807
9808 /* Prepare the server uuid. */
9809 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9810 "%s-%s-1", ENCRYPTION_MASTER_KEY_PRIFIX,
9811 uuid);
9812
9813 /* We call key ring API to generate master key here. */
9814 ret = my_key_generate(key_name, "AES",
9815 NULL, ENCRYPTION_KEY_LEN);
9816
9817 /* We call key ring API to get master key here. */
9818 ret = my_key_fetch(key_name, &key_type, NULL,
9819 reinterpret_cast<void**>(master_key),
9820 &key_len);
9821
9822 if (!ret && *master_key != NULL) {
9823 Encryption::master_key_id++;
9824 *master_key_id = Encryption::master_key_id;
9825 }
9826 #ifdef UNIV_ENCRYPT_DEBUG
9827 if (!ret && *master_key) {
9828 fprintf(stderr, "Generated new master key:");
9829 ut_print_buf(stderr, *master_key, key_len);
9830 fprintf(stderr, "\n");
9831 }
9832 #endif
9833 } else {
9834 *master_key_id = Encryption::master_key_id;
9835
9836 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9837 "%s-%s-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9838 uuid, *master_key_id);
9839
9840 /* We call key ring API to get master key here. */
9841 ret = my_key_fetch(key_name, &key_type, NULL,
9842 reinterpret_cast<void**>(master_key),
9843 &key_len);
9844
9845 /* For compitable with 5.7.11, we need to try to get master key with
9846 server id when get master key with server uuid failure. */
9847 if (ret || *master_key == NULL) {
9848 if (key_type) {
9849 my_free(key_type);
9850 }
9851
9852 memset(key_name, 0,
9853 ENCRYPTION_MASTER_KEY_NAME_MAX_LEN);
9854 ut_snprintf(key_name, ENCRYPTION_MASTER_KEY_NAME_MAX_LEN,
9855 "%s-%lu-%lu", ENCRYPTION_MASTER_KEY_PRIFIX,
9856 server_id, *master_key_id);
9857
9858 ret = my_key_fetch(key_name, &key_type, NULL,
9859 reinterpret_cast<void**>(master_key),
9860 &key_len);
9861 *version = Encryption::ENCRYPTION_VERSION_1;
9862 }
9863 #ifdef UNIV_ENCRYPT_DEBUG
9864 if (!ret && *master_key) {
9865 fprintf(stderr, "Fetched master key:%lu ",
9866 *master_key_id);
9867 ut_print_buf(stderr, *master_key, key_len);
9868 fprintf(stderr, "\n");
9869 }
9870 #endif
9871 }
9872
9873 if (ret) {
9874 *master_key = NULL;
9875 ib::error() << "Encryption can't find master key, please check"
9876 " the keyring plugin is loaded.";
9877 }
9878
9879 if (key_type) {
9880 my_free(key_type);
9881 }
9882 #endif
9883 }
9884
9885 #ifndef UNIV_INNOCHECKSUM
9886
9887 /** Fill the encryption information.
9888 @param[in] key encryption key
9889 @param[in] iv encryption iv
9890 @param[in,out] encrypt_info encryption information
9891 @return true if success */
fill_encryption_info(byte * key,byte * iv,byte * encrypt_info)9892 bool Encryption::fill_encryption_info(byte* key,
9893 byte* iv,
9894 byte* encrypt_info)
9895 {
9896 byte key_info[ENCRYPTION_KEY_LEN * 2];
9897
9898 /* Get master key from key ring. For bootstrap, we use a default
9899 master key which master_key_id is 0. */
9900 byte* master_key;
9901 ulint master_key_id;
9902 Version version;
9903 get_master_key(&master_key_id, &master_key, &version);
9904 if (master_key == NULL) {
9905 return(false);
9906 }
9907
9908 memset(encrypt_info, 0, ENCRYPTION_INFO_SIZE_V2);
9909 memset(key_info, 0, ENCRYPTION_KEY_LEN * 2);
9910
9911 /* Use the new master key to encrypt the key. */
9912 ut_ad(encrypt_info != NULL);
9913 byte* ptr = encrypt_info;
9914
9915 if (version == ENCRYPTION_VERSION_1) {
9916 memcpy(ptr, ENCRYPTION_KEY_MAGIC_V1, ENCRYPTION_MAGIC_SIZE);
9917 } else if (version == ENCRYPTION_VERSION_2) {
9918 memcpy(ptr, ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE);
9919 } else {
9920 memcpy(ptr, ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE);
9921 }
9922 ptr += ENCRYPTION_MAGIC_SIZE;
9923
9924 mach_write_to_4(ptr, master_key_id);
9925 if (version == ENCRYPTION_VERSION_3) {
9926 ptr += sizeof(uint32);
9927 } else {
9928 ptr += sizeof(master_key_id);
9929 }
9930
9931 if (version >= ENCRYPTION_VERSION_2) {
9932 memcpy(ptr, uuid, ENCRYPTION_SERVER_UUID_LEN);
9933 ptr += ENCRYPTION_SERVER_UUID_LEN;
9934 }
9935
9936 memcpy(key_info, key, ENCRYPTION_KEY_LEN);
9937
9938 memcpy(key_info + ENCRYPTION_KEY_LEN, iv, ENCRYPTION_KEY_LEN);
9939
9940 /* Encrypt key and iv. */
9941 const lint elen = my_aes_encrypt(key_info,
9942 ENCRYPTION_KEY_LEN * 2,
9943 ptr,
9944 master_key,
9945 ENCRYPTION_KEY_LEN,
9946 my_aes_256_ecb,
9947 NULL, false);
9948
9949 if (elen == MY_AES_BAD_DATA) {
9950 my_free(master_key);
9951 return(false);
9952 }
9953
9954 ptr += ENCRYPTION_KEY_LEN * 2;
9955
9956 /* Write checksum bytes. */
9957 ulint crc = ut_crc32(key_info, ENCRYPTION_KEY_LEN * 2);
9958 mach_write_to_4(ptr, crc);
9959
9960 my_free(master_key);
9961
9962 return(true);
9963 }
9964
9965 /** Decoding the encryption info
9966 from the first page of a tablespace.
9967 @param[in,out] key key
9968 @param[in,out] iv iv
9969 @param[in] encryption_info encrytion info.
9970 @return true if success */
9971 bool
decode_encryption_info(byte * key,byte * iv,byte * encryption_info)9972 Encryption::decode_encryption_info(byte* key,
9973 byte* iv,
9974 byte* encryption_info)
9975 {
9976 byte* master_key = NULL;
9977 byte key_info[ENCRYPTION_KEY_LEN * 2];
9978 char srv_uuid[ENCRYPTION_SERVER_UUID_LEN + 1];
9979
9980 byte* ptr = encryption_info;
9981
9982 /* For compatibility with 5.7.11, we need to handle the
9983 encryption information which created in this old version. */
9984 Version version;
9985 if (memcmp(ptr, ENCRYPTION_KEY_MAGIC_V1,
9986 ENCRYPTION_MAGIC_SIZE) == 0) {
9987 version = ENCRYPTION_VERSION_1;
9988 } else if (memcmp(ptr, ENCRYPTION_KEY_MAGIC_V2,
9989 ENCRYPTION_MAGIC_SIZE) == 0) {
9990 version = ENCRYPTION_VERSION_2;
9991 } else {
9992 version = ENCRYPTION_VERSION_3;
9993 }
9994
9995 /* Check magic. */
9996 if (version >= ENCRYPTION_VERSION_2
9997 && memcmp(ptr, ENCRYPTION_KEY_MAGIC_V2, ENCRYPTION_MAGIC_SIZE) != 0
9998 && memcmp(ptr, ENCRYPTION_KEY_MAGIC_V3, ENCRYPTION_MAGIC_SIZE) != 0) {
9999 /* We ignore report error for recovery,
10000 since the encryption info maybe hasn't writen
10001 into datafile when the table is newly created. */
10002 return recv_recovery_is_on();
10003 }
10004
10005 ptr += ENCRYPTION_MAGIC_SIZE;
10006
10007 /* Get master key id. */
10008 const ulint m_key_id = mach_read_from_4(ptr);
10009 if (version == ENCRYPTION_VERSION_3) {
10010 ptr += sizeof(uint32);
10011 } else {
10012 ptr += sizeof(ptr);
10013 }
10014
10015 /* Get server uuid. */
10016 if (version >= ENCRYPTION_VERSION_2) {
10017 memset(srv_uuid, 0, ENCRYPTION_SERVER_UUID_LEN + 1);
10018 memcpy(srv_uuid, ptr, ENCRYPTION_SERVER_UUID_LEN);
10019 ptr += ENCRYPTION_SERVER_UUID_LEN;
10020 }
10021
10022 /* Get master key by key id. */
10023 memset(key_info, 0, ENCRYPTION_KEY_LEN * 2);
10024 if (version == ENCRYPTION_VERSION_1) {
10025 get_master_key(m_key_id, NULL, &master_key);
10026 } else {
10027 if (m_key_id == 0) {
10028 /* When m_key_id is 0, which means it's the
10029 default master key for bootstrap. */
10030 master_key = static_cast<byte*>(ut_zalloc_nokey(
10031 ENCRYPTION_KEY_LEN));
10032 memcpy(master_key, ENCRYPTION_DEFAULT_MASTER_KEY,
10033 strlen(ENCRYPTION_DEFAULT_MASTER_KEY));
10034 } else {
10035 get_master_key(m_key_id, srv_uuid, &master_key);
10036 }
10037 }
10038
10039 if (master_key == NULL) {
10040 return(false);
10041 }
10042
10043 #ifdef UNIV_ENCRYPT_DEBUG
10044 fprintf(stderr, "%lu ", m_key_id);
10045 for (const byte* data = (const byte*) master_key, ulint i = 0;
10046 i < ENCRYPTION_KEY_LEN; i++)
10047 fprintf(stderr, "%02lx", (ulong)*data++);
10048 #endif
10049
10050 /* Decrypt tablespace key and iv. */
10051 const lint elen = my_aes_decrypt(
10052 ptr,
10053 ENCRYPTION_KEY_LEN * 2,
10054 key_info,
10055 master_key,
10056 ENCRYPTION_KEY_LEN,
10057 my_aes_256_ecb, NULL, false);
10058
10059 if (elen == MY_AES_BAD_DATA) {
10060 if (m_key_id == 0) {
10061 ut_free(master_key);
10062 } else {
10063 my_free(master_key);
10064 }
10065 return(NULL);
10066 }
10067
10068 /* Check checksum bytes. */
10069 ptr += ENCRYPTION_KEY_LEN * 2;
10070
10071 const ulint crc1 = mach_read_from_4(ptr);
10072 const ulint crc2 = ut_crc32(key_info, ENCRYPTION_KEY_LEN * 2);
10073 if (crc1 != crc2) {
10074 ib::error() << "Failed to decrypt encryption information,"
10075 << " please check whether key file has been changed!";
10076 if (m_key_id == 0) {
10077 ut_free(master_key);
10078 } else {
10079 my_free(master_key);
10080 }
10081 return(false);
10082 }
10083
10084 /* Get tablespace key */
10085 memcpy(key, key_info, ENCRYPTION_KEY_LEN);
10086
10087 /* Get tablespace iv */
10088 memcpy(iv, key_info + ENCRYPTION_KEY_LEN,
10089 ENCRYPTION_KEY_LEN);
10090
10091 #ifdef UNIV_ENCRYPT_DEBUG
10092 fprintf(stderr, " ");
10093 for (const byte* data = (const byte*) key,
10094 ulint i = 0; i < ENCRYPTION_KEY_LEN; i++)
10095 fprintf(stderr, "%02lx", (ulong)*data++);
10096 fprintf(stderr, " ");
10097 for (const byte* data = (const byte*) iv,
10098 ulint i = 0; i < ENCRYPTION_KEY_LEN; i++)
10099 fprintf(stderr, "%02lx", (ulong)*data++);
10100 fprintf(stderr, "\n");
10101 #endif
10102
10103 if (m_key_id == 0) {
10104 ut_free(master_key);
10105 } else {
10106 my_free(master_key);
10107 }
10108
10109 if (master_key_id < m_key_id) {
10110 master_key_id = m_key_id;
10111 memcpy(uuid, srv_uuid, ENCRYPTION_SERVER_UUID_LEN);
10112 }
10113
10114 return(true);
10115 }
10116
10117 bool
is_encrypted_and_compressed(const byte * page)10118 Encryption::is_encrypted_and_compressed(const byte *page)
10119 {
10120 ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
10121
10122 return page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED;
10123 }
10124
10125 /** Check if redo log block is encrypted block or not
10126 @param[in] block log block to check
10127 @return true if it is an encrypted block */
10128 bool
is_encrypted_log(const byte * block)10129 Encryption::is_encrypted_log(const byte *block) {
10130 return (log_block_get_encrypt_bit(block));
10131 }
10132
10133 /** Encrypt the redo log block.
10134 @param[in] type IORequest
10135 @param[in] src_ptr log block which need to encrypt
10136 @param[in,out] dst_ptr destination area
10137 @return true if success. */
10138 bool
encrypt_log_block(const IORequest & type,byte * src_ptr,byte * dst_ptr)10139 Encryption::encrypt_log_block(const IORequest &type, byte* src_ptr,
10140 byte* dst_ptr) {
10141 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
10142
10143 #ifdef UNIV_ENCRYPT_DEBUG
10144 fprintf(stderr, "Encrypting block %lu.\n",
10145 log_block_get_hdr_no(src_ptr));
10146 ut_print_buf_hex(std::cerr, src_ptr, OS_FILE_LOG_BLOCK_SIZE);
10147 fprintf(stderr, "\n");
10148 #endif
10149 /* This is data size which need to encrypt. */
10150 const ulint unencrypted_trailer_size =
10151 (m_type == Encryption::KEYRING) ? LOG_BLOCK_TRL_SIZE : 0;
10152 const ulint data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10153 unencrypted_trailer_size;
10154 const ulint main_len =
10155 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10156 ulint remain_len = data_len - main_len;
10157
10158 /* Encrypt the block. */
10159 /* Copy the header as is. */
10160 memmove(dst_ptr, src_ptr, LOG_BLOCK_HDR_SIZE);
10161 ut_ad(memcmp(src_ptr, dst_ptr, LOG_BLOCK_HDR_SIZE) == 0);
10162
10163 switch (m_type) {
10164 case Encryption::NONE:
10165 ut_error;
10166
10167 case Encryption::KEYRING:
10168 case Encryption::AES: {
10169 lint elen;
10170
10171 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10172
10173 elen = my_aes_encrypt(
10174 src_ptr + LOG_BLOCK_HDR_SIZE,
10175 static_cast<uint32>(main_len),
10176 dst_ptr + LOG_BLOCK_HDR_SIZE,
10177 reinterpret_cast<unsigned char *>(m_key),
10178 static_cast<uint32>(m_klen), my_aes_256_cbc,
10179 reinterpret_cast<unsigned char *>(m_iv), false);
10180
10181 if (elen == MY_AES_BAD_DATA) {
10182 return (false);
10183 }
10184
10185 const ulint len = static_cast<ulint>(elen);
10186 ut_ad(len == main_len);
10187
10188 /* Copy remaining bytes. */
10189 memcpy(dst_ptr + LOG_BLOCK_HDR_SIZE + len,
10190 src_ptr + LOG_BLOCK_HDR_SIZE + len,
10191 OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10192 len);
10193
10194 /* Encrypt the remaining bytes. Since my_aes_encrypt
10195 request the content which need to encrypt is
10196 multiple of MY_AES_BLOCK_SIZE, but the block
10197 content is possiblly not, so, we need to handle
10198 the tail bytes first. */
10199 if (remain_len != 0) {
10200 remain_len = MY_AES_BLOCK_SIZE * 2;
10201
10202 elen = my_aes_encrypt(
10203 dst_ptr + LOG_BLOCK_HDR_SIZE + data_len -
10204 remain_len,
10205 static_cast<uint32>(remain_len),
10206 remain_buf,
10207 reinterpret_cast<unsigned char *>(m_key),
10208 static_cast<uint32>(m_klen),
10209 my_aes_256_cbc,
10210 reinterpret_cast<unsigned char *>(m_iv),
10211 false);
10212
10213 if (elen == MY_AES_BAD_DATA) {
10214 return (false);
10215 }
10216
10217 memcpy(dst_ptr + LOG_BLOCK_HDR_SIZE +
10218 data_len - remain_len,
10219 remain_buf, remain_len);
10220 }
10221
10222 break;
10223 }
10224
10225 default:
10226 ut_error;
10227 }
10228
10229 /* Set the encrypted flag. */
10230 log_block_set_encrypt_bit(dst_ptr, true);
10231
10232 if (m_type == Encryption::KEYRING) {
10233 const ulint crc = log_block_calc_checksum_crc32(dst_ptr);
10234 log_block_set_checksum(dst_ptr, crc + m_key_version);
10235 }
10236
10237
10238 #ifdef UNIV_ENCRYPT_DEBUG
10239 fprintf(stderr, "Encrypted block %lu.\n",
10240 log_block_get_hdr_no(dst_ptr));
10241 ut_print_buf_hex(std::cerr, dst_ptr, OS_FILE_LOG_BLOCK_SIZE);
10242 fprintf(stderr, "\n");
10243
10244 byte* check_buf =
10245 static_cast<byte *>(ut_malloc_nokey(OS_FILE_LOG_BLOCK_SIZE));
10246 byte* buf2 =
10247 static_cast<byte *>(ut_malloc_nokey(OS_FILE_LOG_BLOCK_SIZE));
10248
10249 memcpy(check_buf, dst_ptr, OS_FILE_LOG_BLOCK_SIZE);
10250 dberr_t err = decrypt_log(type, check_buf, OS_FILE_LOG_BLOCK_SIZE,
10251 buf2, OS_FILE_LOG_BLOCK_SIZE);
10252 if (err != DB_SUCCESS ||
10253 memcmp(src_ptr, check_buf, OS_FILE_LOG_BLOCK_SIZE) != 0) {
10254 ut_print_buf_hex(std::cerr, src_ptr, OS_FILE_LOG_BLOCK_SIZE);
10255 ut_print_buf_hex(std::cerr, check_buf,
10256 OS_FILE_LOG_BLOCK_SIZE);
10257 ut_ad(0);
10258 }
10259 ut_free(buf2);
10260 ut_free(check_buf);
10261 #endif
10262
10263 return (true);
10264 }
10265
10266 /** Encrypt the redo log data contents.
10267 @param[in] type IORequest
10268 @param[in] src page data which need to encrypt
10269 @param[in] src_len Size of the source in bytes
10270 @param[in,out] dst destination area
10271 @param[in,out] dst_len Size of the destination in bytes
10272 @return buffer data, dst_len will have the length of the data */
10273 byte *
encrypt_log(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint * dst_len)10274 Encryption::encrypt_log(const IORequest &type, byte* src, ulint src_len,
10275 byte* dst, ulint* dst_len) {
10276 byte* src_ptr = src;
10277 byte* dst_ptr = dst;
10278
10279 ut_ad(type.is_log());
10280 ut_ad(src_len % OS_FILE_LOG_BLOCK_SIZE == 0);
10281 ut_ad(m_type != Encryption::NONE);
10282
10283 /* Encrypt the log blocks one by one. */
10284 while (src_ptr != src + src_len) {
10285 if (!encrypt_log_block(type, src_ptr, dst_ptr)) {
10286 *dst_len = src_len;
10287 ib::error() << " Can't encrypt data of"
10288 << " redo log";
10289 return (src);
10290 }
10291
10292 src_ptr += OS_FILE_LOG_BLOCK_SIZE;
10293 dst_ptr += OS_FILE_LOG_BLOCK_SIZE;
10294 }
10295
10296 #ifdef UNIV_ENCRYPT_DEBUG
10297 byte* check_buf = static_cast<byte *>(ut_malloc_nokey(src_len));
10298 byte* buf2 = static_cast<byte *>(ut_malloc_nokey(src_len));
10299
10300 memcpy(check_buf, dst, src_len);
10301
10302 dberr_t err = decrypt_log(type, check_buf, src_len, buf2, src_len);
10303 if (err != DB_SUCCESS || memcmp(src, check_buf, src_len) != 0) {
10304 ut_print_buf_hex(std::cerr, src, src_len);
10305 ut_print_buf_hex(std::cerr, check_buf, src_len);
10306 ut_ad(0);
10307 }
10308 ut_free(buf2);
10309 ut_free(check_buf);
10310 #endif
10311
10312 return (dst);
10313 }
10314
10315
10316 #endif
10317
10318 /** Check if page is encrypted page or not
10319 @param[in] page page which need to check
10320 @return true if it is an encrypted page */
10321 bool
is_encrypted_page(const byte * page)10322 Encryption::is_encrypted_page(const byte* page)
10323 {
10324 ulint page_type = mach_read_from_2(page + FIL_PAGE_TYPE);
10325
10326 return(page_type == FIL_PAGE_ENCRYPTED
10327 || page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED
10328 || page_type == FIL_PAGE_ENCRYPTED_RTREE);
10329 }
10330
/** Encrypt the page data contents. Page type can't be
FIL_PAGE_ENCRYPTED, FIL_PAGE_COMPRESSED_AND_ENCRYPTED,
FIL_PAGE_ENCRYPTED_RTREE.
The first FIL_PAGE_DATA bytes (the page header) are copied to dst
unencrypted; the data after the header is encrypted in my_aes_256_cbc
mode with m_key and m_iv. The page type in dst is replaced by the matching
"encrypted" type. For KEYRING encryption the key version and a checksum
computed after encryption are stored in dst as well.
@param[in]	type		IORequest
@param[in]	src		page data which need to encrypt
@param[in]	src_len		Size of the source in bytes
@param[in,out]	dst		destination area
@param[in,out]	dst_len		Size of the destination in bytes; it is read
				in the KEYRING branch before being assigned
				and is set to src_len before returning
@return dst on success, src if my_aes_encrypt() reported MY_AES_BAD_DATA;
dst_len will have the length of the data */
byte*
Encryption::encrypt(
	const IORequest&	type,
	byte*			src,
	ulint			src_len,
	byte*			dst,
	ulint*			dst_len)
{
	ut_ad(m_type != Encryption::NONE);
	ut_ad(m_type != Encryption::KEYRING || m_key != NULL);
	/* For encrypting redo log, take another way. */
	ut_ad(!type.is_log());
	/* Shouldn't encrypt an already encrypted page. */
	ut_ad(!is_encrypted_page(src));

#ifdef UNIV_ENCRYPT_DEBUG
	ulint space_id =
		mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
	ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);

	fprintf(stderr, "Encrypting page:%lu.%lu len:%lu\n",
		space_id, page_no, src_len);
#endif

	const uint16_t page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
	// Destination header might need to accommodate key_version and checksum after encryption:
	// for a KEYRING-encrypted compressed page the ciphertext starts 8 bytes after FIL_PAGE_DATA.
	const uint DST_HEADER_SIZE = (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED)
		? FIL_PAGE_DATA + 8 : FIL_PAGE_DATA;

	/* This is data size which need to encrypt. */
	ulint src_enc_len = src_len;

	/* In FIL_PAGE_VERSION_2, we encrypt the actual compressed data length.
	The length is taken from the FIL_PAGE_COMPRESS_SIZE_V1 field plus the
	header size, and is never smaller than MIN_ENCRYPTION_LEN. */
	if (page_type == FIL_PAGE_COMPRESSED) {
		src_enc_len = mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1) +
			FIL_PAGE_DATA;
		/* Extend src_enc_len if needed */
		if (src_enc_len < MIN_ENCRYPTION_LEN) {
			src_enc_len = MIN_ENCRYPTION_LEN;
		}
		ut_a(src_enc_len <= src_len);
	}

	/* Total length of the data to encrypt. */
	ulint data_len = 0;
	if (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED) {
		data_len = src_enc_len - FIL_PAGE_DATA;
		// We need those 8 bytes for key_version and post-encryption checksum
		// There need to be at least 8 bytes left
		// NOTE(review): the check below is disabled, so nothing here
		// verifies that src_len leaves room for the 8 extra bytes.
		//ut_ad((uint)(*(src + src_len -8)) == 0);
	} else if (m_type == Encryption::KEYRING && !type.is_page_zip_compressed()) {
		// For keyring encryption we do not encrypt last four bytes which are
		// equal to the LSN bytes in header so they are not encrypted anyway
		data_len = src_enc_len - FIL_PAGE_DATA - 4;
	} else {
		data_len = src_enc_len - FIL_PAGE_DATA;
	}

	/* Only encrypt the data + trailer, leave the header alone */
	switch (m_type) {
	case Encryption::NONE:
		ut_error;

	case Encryption::KEYRING :
		//fallthrough

	case Encryption::AES: {
		ut_ad(m_klen == ENCRYPTION_KEY_LEN);
		ut_ad(m_iv != NULL);

		/* Server encryption functions expect input data to be in
		multiples of MY_AES_BLOCK SIZE. Therefore we encrypt the
		overlapping data of the chunk_len and trailer_len twice.
		First we encrypt the bigger chunk of data then we do the
		trailer. The trailer encryption block starts at
		2 * MY_AES_BLOCK_SIZE bytes offset from the end of the enc_len.
		During decryption we do the reverse of the above process. */
		ut_ad(data_len >= 2 * MY_AES_BLOCK_SIZE);

		/* chunk_len is data_len rounded down to whole AES blocks;
		remain_len is what is left over. */
		const ulint chunk_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
		ulint remain_len = data_len - chunk_len;

		/* First pass: source data after the page header goes to the
		destination after DST_HEADER_SIZE. */
		lint elen = my_aes_encrypt(
			src + FIL_PAGE_DATA, static_cast<uint32>(chunk_len),
			dst + DST_HEADER_SIZE, reinterpret_cast<byte *>(m_key),
			static_cast<uint32>(m_klen), my_aes_256_cbc,
			reinterpret_cast<byte *>(m_iv), false);

		ut_ad(elen != MY_AES_BAD_DATA);

		/* On failure report the page and hand the unencrypted
		source back to the caller. */
		if (elen == MY_AES_BAD_DATA) {
			ulint page_no =mach_read_from_4(
				src + FIL_PAGE_OFFSET);
			ulint space_id = mach_read_from_4(
				src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
			*dst_len = src_len;
#ifndef UNIV_INNOCHECKSUM
			ib::error()
				<< " Can't encrypt data of page,"
				<< " page no:" << page_no
				<< " space id:" << space_id;
#else
			fprintf(stderr, " Can't encrypt data of page,"
				" page no:" ULINTPF
				" space id:" ULINTPF,
				page_no, space_id);
#endif /* !UNIV_INNOCHECKSUM */
			return(src);
		}

		const ulint len = static_cast<ulint>(elen);
		ut_ad(len == chunk_len);

		/* Encrypt the trailing bytes. */
		if (remain_len != 0) {
			/* Copy remaining bytes and page trailer. */
			memcpy(dst + DST_HEADER_SIZE + len,
			       src + FIL_PAGE_DATA + len,
			       remain_len);

			/* Second pass: the last trailer_len bytes of the
			destination data (ciphertext from the first pass
			followed by the plain remainder just copied) are
			encrypted into buf and copied back below. */
			const ulint trailer_len = MY_AES_BLOCK_SIZE * 2;
			byte buf[trailer_len];

			elen = my_aes_encrypt(
				dst + DST_HEADER_SIZE + data_len - trailer_len,
				static_cast<uint32>(trailer_len), buf,
				reinterpret_cast<unsigned char*>(m_key),
				static_cast<uint32>(m_klen), my_aes_256_cbc,
				reinterpret_cast<byte *>(m_iv), false);

			ut_ad(elen != MY_AES_BAD_DATA);

			if (elen == MY_AES_BAD_DATA) {
				ulint page_no =mach_read_from_4(
					src + FIL_PAGE_OFFSET);
				ulint space_id = mach_read_from_4(
					src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
#ifndef UNIV_INNOCHECKSUM
				ib::error()
					<< " Can't encrypt data of page,"
					<< " page no:" << page_no
					<< " space id:" << space_id;
#else
				fprintf(stderr, " Can't encrypt data of page,"
					" page no:" ULINTPF
					" space id:" ULINTPF,
					page_no, space_id);
#endif /* !UNIV_INNOCHECKSUM */
				*dst_len = src_len;
				return(src);
			}

			ut_a(static_cast<ulint>(elen) == trailer_len);

			memcpy(dst + DST_HEADER_SIZE + data_len - trailer_len,
			       buf, trailer_len);
		}

		break;
	}

	default:
		ut_error;
	}

	/* Copy the header as is. */
	memmove(dst, src, FIL_PAGE_DATA);
	ut_ad(memcmp(src, dst, FIL_PAGE_DATA) == 0);

	/* Add encryption control information. Required for decrypting. */
	if (page_type == FIL_PAGE_COMPRESSED) {
		/* If the page is compressed, we don't need to save the
		original type, since it is done in compression already. */
		mach_write_to_2(dst + FIL_PAGE_TYPE,
				FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
		ut_ad(memcmp(src+FIL_PAGE_TYPE+2,
			     dst+FIL_PAGE_TYPE+2,
			     FIL_PAGE_DATA-FIL_PAGE_TYPE-2) == 0);
	} else if (page_type == FIL_PAGE_RTREE) {
		/* An R-tree page gets its own encrypted page type; nothing
		is written to FIL_PAGE_ORIGINAL_TYPE_V1 in this branch. */
		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED_RTREE);
	} else{
		/* Any other page: mark it encrypted and remember the
		original page type in the header. */
		mach_write_to_2(dst + FIL_PAGE_TYPE, FIL_PAGE_ENCRYPTED);
		mach_write_to_2(dst + FIL_PAGE_ORIGINAL_TYPE_V1, page_type);
	}

	if (m_type == Encryption::KEYRING) {
		/* handle post encryption checksum */
		m_checksum = 0;

		/* NOTE(review): *dst_len is read in this branch before this
		function assigns it; only this debug assertion checks that
		the caller passed it in equal to src_len. */
		ut_ad(*dst_len == src_len);

		if (page_type == FIL_PAGE_COMPRESSED) {
			memset(dst + FIL_PAGE_DATA, 0, 4); // set the checksum data to 0s before the checksum is calculated
			mach_write_to_4(dst + FIL_PAGE_DATA + 4, m_key_version); // Add it here so it would be included in the checksum
		}

		if (type.is_page_zip_compressed())
			memcpy(dst + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC,
			       ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);

#ifndef UNIV_INNOCHECKSUM //TODO: Robert - this might need to be included in innodbchecksum
		/* Size handed to the checksum routine: the stored compressed
		size, the zip physical size, or the full destination length. */
		uint page_size = *dst_len;
		if (page_type == FIL_PAGE_COMPRESSED) {
			page_size = static_cast<uint16_t>(mach_read_from_2(dst + FIL_PAGE_COMPRESS_SIZE_V1));
		} else if (type.is_page_zip_compressed()) {
			page_size = type.get_zip_page_physical_size();
		}
		m_checksum = fil_crypt_calculate_checksum(page_size, dst, type.is_page_zip_compressed());
#endif
		ut_ad(m_key_version != 0); // Since we are encrypting key_version cannot be 0 (i.e. page unencrypted)


		/* Store the checksum (and the key version where it has not
		been written yet). The location depends on the page kind:
		compressed pages use the 4 bytes at FIL_PAGE_DATA, plain
		pages use the last 4 bytes of the page, zip pages XOR it
		into the FIL_PAGE_SPACE_OR_CHKSUM field. */
		if (page_type == FIL_PAGE_COMPRESSED) {
			mach_write_to_4(dst + FIL_PAGE_DATA, m_checksum);
		} else if (!type.is_page_zip_compressed()) {
			mach_write_to_4(dst + FIL_PAGE_ENCRYPTION_KEY_VERSION, m_key_version);
			ut_ad(m_checksum != 0);
			mach_write_to_4(dst + *dst_len - 4, m_checksum);
		}
		else if (type.is_page_zip_compressed()) {
			mach_write_to_4(dst + FIL_PAGE_ENCRYPTION_KEY_VERSION, m_key_version);
			ut_ad(m_key_version != 0);
			uint32 innodb_checksum = mach_read_from_4(dst + FIL_PAGE_SPACE_OR_CHKSUM);
			uint32 xor_checksum = innodb_checksum ^ m_checksum;
			mach_write_to_4(dst + FIL_PAGE_SPACE_OR_CHKSUM, xor_checksum);
			ut_ad(m_checksum != 0);
		}

#ifdef UNIV_ENCRYPT_DEBUG
		ut_ad(type.is_page_zip_compressed() ||
		      fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed())); // This works only for not zipped compressed pages
#endif
	}

#ifdef UNIV_ENCRYPT_DEBUG
#ifndef UNIV_INNOCHECKSUM
#if 1
	/* Debug-only round trip: decrypt a copy of the result and compare
	it with the source page. */
	if (m_type == Encryption::KEYRING)
	{

		byte* check_buf = static_cast<byte*>(ut_malloc_nokey(src_len));
		byte* buf2 = static_cast<byte*>(ut_malloc_nokey(src_len));

		memcpy(check_buf, dst, src_len);

		fprintf(stderr, "Robert: Comparing before and after encryption");

		byte *m_key_used = m_key;

		if (m_type == Encryption::KEYRING) // TODO:Robert:For decryption KEYRING page key needs to be set to NULL
			m_key = NULL;

		dberr_t err = decrypt(type, check_buf, src_len, buf2, src_len);
		if (space_id == 23 && page_no == 1)
		{
			fprintf(stderr, "Robert: After encrypting page 23:1:");
			ut_print_buf(stderr, dst, src_len);
		}

		if (err != DB_SUCCESS || memcmp(src + FIL_PAGE_DATA,
						check_buf + FIL_PAGE_DATA,
						src_len - FIL_PAGE_DATA - 4) != 0) {

			fprintf(stderr, "Robert: After and before encryption are different. "
				" key_version used for encryption: %d, key used for encryption:", m_key_version);
			ut_print_buf(stderr, m_key_used, 32);
			m_key_version= mach_read_from_4(check_buf + FIL_PAGE_ENCRYPTION_KEY_VERSION);
			fprintf(stderr, "Robert: After and before encryption are different. "
				" key_version used for decryption: %d, key used for decryption:", m_key_version);

			size_t key_len;
			get_tablespace_key(m_key_id, uuid, m_key_version, &m_key, &key_len);
			ut_print_buf(stderr, m_key, 32);

			ut_ad(0);
		}
		ut_free(buf2);
		ut_free(check_buf);

		ut_ad(type.is_page_zip_compressed() ||
		      fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed()));

		ut_ad(type.is_page_zip_compressed() ||
		      fil_space_verify_crypt_checksum(dst, *dst_len, type.is_page_zip_compressed(), type.is_compressed()));
	}
#endif
	fprintf(stderr, "Encrypted page:%lu.%lu\n", space_id, page_no);

#endif
#endif

#ifdef UNIV_ENCRYPT_DEBUG
	fprintf(stderr, "Robert:Encrypted page:%lu.%lu\n", space_id, page_no);
#endif

#if !defined(UNIV_INNOCHECKSUM)
	srv_stats.pages_encrypted.inc();
#endif

	/* Add padding 0 for unused portion */
	/* NOTE(review): the length is an unsigned subtraction; it assumes
	src_len >= DST_HEADER_SIZE + data_len, which for a KEYRING-encrypted
	compressed page means at least 8 spare bytes after src_enc_len -
	TODO confirm the caller guarantees this. */
	if (src_len > src_enc_len) {
		memset(dst + DST_HEADER_SIZE + data_len, 0,
		       src_len - DST_HEADER_SIZE - data_len);
	}

	*dst_len = src_len;

	return(dst);
}
10650
10651 #ifndef UNIV_INNOCHECKSUM
10652
10653 /** Decrypt the log block.
10654 @param[in] type IORequest
10655 @param[in,out] src Data read from disk, decrypted data will be
10656 copied to this page
10657 @param[in,out] dst Scratch area to use for decryption
10658 @return DB_SUCCESS or error code */
10659 dberr_t
decrypt_log_block(const IORequest & type,byte * src,byte * dst)10660 Encryption::decrypt_log_block(const IORequest &type, byte* src, byte* dst) {
10661 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
10662
10663 const ulint unencrypted_trailer_size =
10664 (m_type == Encryption::KEYRING) ? LOG_BLOCK_TRL_SIZE : 0;
10665 const ulint data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_HDR_SIZE -
10666 unencrypted_trailer_size;
10667 const ulint main_len =
10668 (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10669 ulint remain_len = data_len - main_len;
10670 byte* ptr = src + LOG_BLOCK_HDR_SIZE;
10671
10672 switch (m_type) {
10673 case Encryption::KEYRING: {
10674 const ulint block_crc =
10675 log_block_calc_checksum_crc32(src);
10676 const ulint written_crc = log_block_get_checksum(src);
10677
10678 const ulint enc_key_version = written_crc - block_crc;
10679
10680 if (m_key_version != enc_key_version &&
10681 enc_key_version != REDO_LOG_ENCRYPT_NO_VERSION) {
10682 redo_log_key* mkey =
10683 redo_log_key_mgr.load_key_version(
10684 NULL, enc_key_version);
10685 m_key_version = mkey->version;
10686 m_key = reinterpret_cast<unsigned char *>(
10687 mkey->key);
10688 }
10689 }
10690 /* FALLTHROUGH */
10691 case Encryption::AES: {
10692 lint elen;
10693
10694 /* First decrypt the last 2 blocks data of data, since
10695 data is no block aligned. */
10696 if (remain_len != 0) {
10697 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10698
10699 remain_len = MY_AES_BLOCK_SIZE * 2;
10700
10701 /* Copy the last 2 blocks. */
10702 memcpy(remain_buf,
10703 ptr + data_len - remain_len,
10704 remain_len);
10705
10706 elen = my_aes_decrypt(
10707 remain_buf,
10708 static_cast<uint32>(remain_len),
10709 dst + data_len - remain_len,
10710 reinterpret_cast<unsigned char *>(m_key),
10711 static_cast<uint32>(m_klen),
10712 my_aes_256_cbc,
10713 reinterpret_cast<unsigned char *>(m_iv),
10714 false);
10715 if (elen == MY_AES_BAD_DATA) {
10716 return (DB_IO_DECRYPT_FAIL);
10717 }
10718
10719 /* Copy the other data bytes to temp area. */
10720 memcpy(dst, ptr, data_len - remain_len);
10721 } else {
10722 ut_ad(data_len == main_len);
10723
10724 /* Copy the data bytes to temp area. */
10725 memcpy(dst, ptr, data_len);
10726 }
10727
10728 /* Then decrypt the main data */
10729 elen = my_aes_decrypt(
10730 dst, static_cast<uint32>(main_len), ptr,
10731 reinterpret_cast<unsigned char *>(m_key),
10732 static_cast<uint32>(m_klen), my_aes_256_cbc,
10733 reinterpret_cast<unsigned char *>(m_iv), false);
10734 if (elen == MY_AES_BAD_DATA) {
10735 return (DB_IO_DECRYPT_FAIL);
10736 }
10737
10738 ut_ad(static_cast<ulint>(elen) == main_len);
10739
10740 /* Copy the remaining bytes. */
10741 memcpy(ptr + main_len, dst + main_len,
10742 data_len - main_len);
10743
10744 break;
10745 }
10746
10747 default:
10748 ib::error()
10749 << "Encryption algorithm support missing: "
10750 << Encryption::to_string(m_type);
10751 return (DB_UNSUPPORTED);
10752 }
10753
10754 ptr -= LOG_BLOCK_HDR_SIZE;
10755
10756 #ifdef UNIV_ENCRYPT_DEBUG
10757 fprintf(stderr, "Decrypted block %lu.\n", log_block_get_hdr_no(ptr));
10758 ut_print_buf_hex(std::cerr, ptr, OS_FILE_LOG_BLOCK_SIZE);
10759 fprintf(stderr, "\n");
10760 #endif
10761
10762 /* Reset the encrypted flag. */
10763 log_block_set_encrypt_bit(ptr, false);
10764
10765 if (m_type == Encryption::KEYRING) {
10766 const ulint crc = log_block_calc_checksum_crc32(src);
10767 log_block_set_checksum(src, crc);
10768 }
10769
10770 return (DB_SUCCESS);
10771 }
10772
10773 /** Decrypt the log data contents.
10774 @param[in] type IORequest
10775 @param[in,out] src Data read from disk, decrypted data will be
10776 copied to this page
10777 @param[in] src_len source data length
10778 @param[in,out] dst Scratch area to use for decryption
10779 @param[in] dst_len Size of the scratch area in bytes
10780 @return DB_SUCCESS or error code */
10781 dberr_t
decrypt_log(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)10782 Encryption::decrypt_log(const IORequest &type, byte* src, ulint src_len,
10783 byte* dst, ulint dst_len) {
10784 Block* block;
10785 byte* ptr = src;
10786 dberr_t ret;
10787
10788 ut_ad(type.is_log());
10789
10790 /* The caller doesn't know what to expect */
10791 if (dst == NULL) {
10792 block = os_alloc_block();
10793 #ifdef UNIV_INNOCHECKSUM
10794 dst = block;
10795 #else
10796 dst = block->m_ptr;
10797 #endif
10798 } else {
10799 block = NULL;
10800 }
10801
10802 /* Encrypt the log blocks one by one. */
10803 while (ptr != src + src_len) {
10804 /* If it's not an encrypted block, skip it. */
10805 if (!is_encrypted_log(ptr)) {
10806 ptr += OS_FILE_LOG_BLOCK_SIZE;
10807 continue;
10808 }
10809 #ifdef UNIV_ENCRYPT_DEBUG
10810 fprintf(stderr, "Decrypting block %lu.\n",
10811 log_block_get_hdr_no(ptr));
10812 ut_print_buf_hex(std::cerr, ptr, OS_FILE_LOG_BLOCK_SIZE);
10813 fprintf(stderr, "\n");
10814 ut_print_buf(stderr, m_key, 32);
10815 ut_print_buf(stderr, m_iv, 32);
10816 #endif
10817
10818 /* Decrypt block */
10819 ret = decrypt_log_block(type, ptr, dst);
10820 if (ret != DB_SUCCESS) {
10821 if (block != NULL) {
10822 os_free_block(block);
10823 }
10824
10825 return (ret);
10826 }
10827
10828 ptr += OS_FILE_LOG_BLOCK_SIZE;
10829 }
10830
10831 if (block != NULL) {
10832 os_free_block(block);
10833 }
10834
10835 return (DB_SUCCESS);
10836 }
10837 #endif
10838
10839 /** Decrypt the page data contents. Page type must be FIL_PAGE_ENCRYPTED,
10840 if not then the source contents are left unchanged and DB_SUCCESS is returned.
10841 @param[in] type IORequest
10842 @param[in,out] src Data read from disk, decrypted data will be
10843 copied to this page
10844 @param[in] src_len source data length
10845 @param[in,out] dst Scratch area to use for decryption
10846 @param[in] dst_len Size of the scratch area in bytes
10847 @return DB_SUCCESS or error code */
10848 dberr_t
decrypt(const IORequest & type,byte * src,ulint src_len,byte * dst,ulint dst_len)10849 Encryption::decrypt(
10850 const IORequest& type,
10851 byte* src,
10852 ulint src_len,
10853 byte* dst,
10854 ulint dst_len)
10855 {
/* Summary: decrypts the page held in src. The encrypted payload is first
staged in dst (or, when dst is NULL, in a block taken from os_alloc_block()
that is released again before returning) and the result is then placed back
into the page body inside src, after which the page type fields in the
header of src are restored.
Returns DB_SUCCESS, DB_IO_DECRYPT_FAIL when my_aes_decrypt() reports
MY_AES_BAD_DATA, or DB_UNSUPPORTED when m_type is neither AES nor KEYRING.
NOTE(review): dst_len is never read in this body, so the size of a
caller-supplied dst is not checked here. */
10856 ulint data_len;
10857 ulint main_len;
10858 ulint remain_len;
10859 ulint original_type;
10860 ulint page_type;
10861 byte remain_buf[MY_AES_BLOCK_SIZE * 2];
10862 Block* block;
10863
10864 #ifndef UNIV_INNOCHECKSUM
10865 /* Do nothing if it's not an encrypted table. */
10866 if (!Encryption::is_encrypted_page(src)) {
10867 return(DB_SUCCESS);
10868 }
/* KEYRING + zip-compressed page: the checksum field of the header is
XOR-ed with the checksum computed over the still-encrypted page and the
result is written back to the same field. Judging by the variable names
this recovers the original InnoDB checksum - TODO confirm against the
encrypt side. Note that src is modified here before any decryption has
been attempted. */
10869 if (m_type == Encryption::KEYRING && type.is_page_zip_compressed()) {
10870 uint32 post_enc_checksum = fil_crypt_calculate_checksum(type.get_zip_page_physical_size(), src, type.is_page_zip_compressed());
10871 uint32 xor_checksum = mach_read_from_4(src + FIL_PAGE_SPACE_OR_CHKSUM);
10872 ut_ad(xor_checksum != 0);
10873 uint32 innodb_checksum = xor_checksum ^ post_enc_checksum;
10874 mach_write_to_4(src + FIL_PAGE_SPACE_OR_CHKSUM, innodb_checksum);
10875 }
10876 #endif
10877
10878 /* For compressed page, we need to get the compressed size
10879 for decryption */
10880 page_type = mach_read_from_2(src + FIL_PAGE_TYPE);
10881 if (page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
/* The caller-supplied src_len is replaced by the compressed size
stored in the page header plus the header length (FIL_PAGE_DATA). */
10882 src_len = static_cast<uint16_t>(
10883 mach_read_from_2(src + FIL_PAGE_COMPRESS_SIZE_V1))
10884 + FIL_PAGE_DATA;
10885 #ifndef UNIV_INNOCHECKSUM
/* Version 1 compression headers: src_len is passed through
ut_calc_align() with the request's block size. Any other version:
src_len is raised to MIN_ENCRYPTION_LEN if it is smaller. */
10886 Compression::meta_t header;
10887 Compression::deserialize_header(src, &header);
10888 if (header.m_version == Compression::FIL_PAGE_VERSION_1) {
10889 src_len = ut_calc_align(src_len, type.block_size());
10890 } else {
10891 /* Extend src_len if needed */
10892 if (src_len < MIN_ENCRYPTION_LEN) {
10893 src_len = MIN_ENCRYPTION_LEN;
10894 }
10895 }
10896 #endif
10897 }
10898 #ifdef UNIV_ENCRYPT_DEBUG
10899 ulint space_id =
10900 mach_read_from_4(src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
10901 ulint page_no = mach_read_from_4(src + FIL_PAGE_OFFSET);
10902
10903 fprintf(stderr, "Decrypting page:%lu.%lu len:%lu\n",
10904 space_id, page_no, src_len);
10905 #endif
/* Offset in src at which the encrypted payload is read from: 8 bytes
past the page header for KEYRING compressed+encrypted pages, directly
after the page header (FIL_PAGE_DATA) for everything else. */
10906 const uint HEADER_SIZE = (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
10907 ? FIL_PAGE_DATA + 8 : FIL_PAGE_DATA;
10908
/* Page type saved in the header; it is written back to FIL_PAGE_TYPE
at the end of this function for FIL_PAGE_ENCRYPTED pages. */
10909 original_type = static_cast<uint16_t>(
10910 mach_read_from_2(src + FIL_PAGE_ORIGINAL_TYPE_V1));
10911
10912 byte* ptr = src + HEADER_SIZE;
10913
10914 /* The caller doesn't know what to expect */
10915 if (dst == NULL) {
10916
/* No scratch buffer supplied: take a temporary block. block stays
non-NULL only in this case, and every return path below frees it. */
10917 block = os_alloc_block();
10918 #ifdef UNIV_INNOCHECKSUM
10919 dst = block;
10920 #else
10921 dst = block->m_ptr;
10922 #endif /* UNIV_INNOCHECKSUM */
10923
10924 } else {
10925 block = NULL;
10926 }
10927
10928 ut_ad(m_key != NULL);
10929
/* Number of payload bytes to decrypt, before the per-format
adjustments that follow. */
10930 data_len = src_len - HEADER_SIZE;
10931
10932 if (m_type == Encryption::KEYRING
10933 && page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
10934 // There are 8 bytes after the header used for key_version and checksum
10935 data_len += 8;
10936 } else if (page_type == FIL_PAGE_ENCRYPTED && m_type == Encryption::KEYRING
10937 && !type.is_page_zip_compressed()) {
10938 data_len -= 4; // Last 4 bytes are not encrypted
10939 }
10940
/* Split the payload into a prefix that is a whole number of AES
blocks (main_len) and a tail shorter than one block (remain_len). */
10941 main_len = (data_len / MY_AES_BLOCK_SIZE) * MY_AES_BLOCK_SIZE;
10942 remain_len = data_len - main_len;
10943
10944 switch(m_type) {
10945 case Encryption::KEYRING:
10946 case Encryption::AES: {
10947 lint elen;
10948
10949 /* First decrypt the last 2 blocks data of data, since
10950 data is no block aligned. */
10951 if (remain_len != 0) {
10952 ut_ad(m_klen == ENCRYPTION_KEY_LEN);
10953 ut_ad(m_iv != NULL);
10954
/* From here on remain_len no longer means "tail length" but the
size of the final two AES blocks of the payload.
NOTE(review): the pointer arithmetic below relies on data_len
being at least 2 * MY_AES_BLOCK_SIZE - confirm callers guarantee
this for every page format. */
10955 remain_len = MY_AES_BLOCK_SIZE * 2;
10956
10957 /* Copy the last 2 blocks. */
10958 memcpy(remain_buf,
10959 ptr + data_len - remain_len,
10960 remain_len);
10961
/* Decrypt the copied two blocks into the matching position at
the end of the scratch buffer. */
10962 elen = my_aes_decrypt(
10963 remain_buf,
10964 static_cast<uint32>(remain_len),
10965 dst + data_len - remain_len,
10966 reinterpret_cast<unsigned char*>(m_key),
10967 static_cast<uint32>(m_klen),
10968 my_aes_256_cbc,
10969 reinterpret_cast<unsigned char*>(m_iv),
10970 false);
10971
/* Debug builds stop on the assertion; other builds take the error
return right below. */
10972 ut_ad(elen != MY_AES_BAD_DATA);
10973
10974 if (elen == MY_AES_BAD_DATA) {
10975 if (block != NULL) {
10976 os_free_block(block);
10977 }
10978
10979 return(DB_IO_DECRYPT_FAIL);
10980 }
10981
10982 /* Copy the other data bytes to temp area. */
10983 memcpy(dst, ptr, data_len - remain_len);
10984 } else {
10985 ut_ad(data_len == main_len);
10986
10987 /* Copy the data bytes to temp area. */
10988 memcpy(dst, ptr, data_len);
10989 }
10990
/* KEYRING compressed+encrypted pages: move the output position back
by 8 bytes so that it starts directly after the page header
(src + FIL_PAGE_DATA), and zero the 8 bytes that follow the end of
the payload at its new position. */
10991 if (m_type == Encryption::KEYRING && page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED) {
10992 ptr -= 8; //This much is unused as it was previously used by key version and encrypted checksum
10993 //It is not needed - overwrite this with decrypted data
10994 memset(ptr + data_len, 0, 8);
10995 }
10996
10997 /* Then decrypt the main data */
10998 elen = my_aes_decrypt(
10999 dst,
11000 static_cast<uint32>(main_len),
11001 ptr,
11002 reinterpret_cast<unsigned char*>(m_key),
11003 static_cast<uint32>(m_klen),
11004 my_aes_256_cbc,
11005 reinterpret_cast<unsigned char*>(m_iv),
11006 false);
11007 if (elen == MY_AES_BAD_DATA) {
11008
11009 if (block != NULL) {
11010 os_free_block(block);
11011 }
11012
11013 return(DB_IO_DECRYPT_FAIL);
11014 }
11015
11016 ut_ad(static_cast<ulint>(elen) == main_len);
11017
/* The bytes past main_len were produced in dst by the two-block step
above; move them into place behind the main data. When the payload
was block aligned, data_len - main_len is 0 and nothing is copied. */
11018 /* Copy the remain bytes. */
11019 memcpy(ptr + main_len, dst + main_len, data_len - main_len);
11020
11021 break;
11022 }
11023
11024 default:
/* Unknown algorithm: log it (except for doublewrite recovery
requests), release the temporary block and give up. */
11025 if (!type.is_dblwr_recover()) {
11026 #if !defined(UNIV_INNOCHECKSUM)
11027 ib::error()
11028 << "Encryption algorithm support missing: "
11029 << Encryption::to_string(m_type);
11030 #else
11031 fprintf(stderr, "Encryption algorithm support missing: %s\n",
11032 Encryption::to_string(m_type));
11033 #endif /* !UNIV_INNOCHECKSUM */
11034 }
11035
11036 if (block != NULL) {
11037 os_free_block(block);
11038 }
11039
11040 return(DB_UNSUPPORTED);
11041 }
11042
/* Copies 4 bytes from the header LSN field (offset FIL_PAGE_LSN + 4)
to offset 4 of the last FIL_PAGE_END_LSN_OLD_CHKSUM bytes of the page.
NOTE(review): this condition also matches FIL_PAGE_ENCRYPTED_RTREE
pages, for which data_len was not shortened by 4 above - confirm that
this is intended. */
11043 if (m_type == Encryption::KEYRING && page_type != FIL_PAGE_COMPRESSED_AND_ENCRYPTED
11044 && !type.is_page_zip_compressed()) {
11045 //restore LSN
11046 memcpy(src + src_len - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, src + FIL_PAGE_LSN + 4, 4);
11047 }
11048
11049 /* Restore the original page type. If it's a compressed and
11050 encrypted page, just reset it as compressed page type, since
11051 we will do uncompress later. */
11052 if (page_type == FIL_PAGE_ENCRYPTED) {
11053 mach_write_to_2(src + FIL_PAGE_TYPE, original_type);
11054 } else if (page_type == FIL_PAGE_ENCRYPTED_RTREE) {
11055 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_RTREE);
11056 } else {
11057 ut_ad(page_type == FIL_PAGE_COMPRESSED_AND_ENCRYPTED);
11058 mach_write_to_2(src + FIL_PAGE_TYPE, FIL_PAGE_COMPRESSED);
11059 }
11060
11061 // mark orignal page_type as encrypted - so that when checksum check fail - we will be able
11062 // to report that if failed because decryption failed
11063 if (original_type != FIL_PAGE_TYPE_ALLOCATED && page_type != FIL_PAGE_COMPRESSED_AND_ENCRYPTED)
11064 mach_write_to_2(src + FIL_PAGE_ORIGINAL_TYPE_V1, FIL_PAGE_ENCRYPTED);
11065
/* The scratch block is no longer needed once the result is in src. */
11066 if (block != NULL) {
11067 os_free_block(block);
11068 }
11069
/* KEYRING + zip-compressed page: zero the keyring encryption magic
bytes in the page header. */
11070 if (m_type == Encryption::KEYRING && type.is_page_zip_compressed())
11071 memset(src + FIL_PAGE_ZIP_KEYRING_ENCRYPTION_MAGIC, 0, ENCRYPTION_ZIP_PAGE_KEYRING_ENCRYPTION_MAGIC_LEN);
11072 #ifdef UNIV_ENCRYPT_DEBUG
11073 fprintf(stderr, "Decrypted page:%lu.%lu\n", space_id, page_no);
11074 #endif
11075
11076 DBUG_EXECUTE_IF("ib_crash_during_decrypt_page", DBUG_SUICIDE(););
11077
11078 #if !defined(UNIV_INNOCHECKSUM)
11079 srv_stats.pages_decrypted.inc();
11080 #endif
11081
11082 return(DB_SUCCESS);
11083 }
11084
11085 #ifndef UNIV_INNOCHECKSUM
11086
11087 /** Check if keyring plugin loaded. */
check_keyring()11088 bool Encryption::check_keyring()
11089 {
11090 char key_name[ENCRYPTION_MASTER_KEY_NAME_MAX_LEN];
11091 memset(key_name, 0, ENCRYPTION_KEY_LEN);
11092 strcpy(key_name, ENCRYPTION_DEFAULT_MASTER_KEY);
11093
11094 /* We call key ring API to generate master key here. */
11095 int my_ret = my_key_generate(key_name, "AES",
11096 NULL, ENCRYPTION_KEY_LEN);
11097
11098 /* We call key ring API to get master key here. */
11099 if (my_ret != 0) {
11100 char* key_type = NULL;
11101 char* master_key = NULL;
11102 size_t key_len;
11103 my_ret = my_key_fetch(key_name, &key_type, NULL,
11104 reinterpret_cast<void**>(&master_key),
11105 &key_len);
11106
11107 my_free(key_type);
11108 my_free(master_key);
11109 }
11110
11111 if (my_ret) {
11112 ib::error() << "keyring error: please check that a"
11113 " keyring plugin is loaded.";
11114 } else {
11115 my_key_remove(key_name, NULL);
11116 return(true);
11117 }
11118
11119 return(false);
11120 }
11121
11122 /** Encrypt a doublewrite buffer page. The page is encrypted
11123 using the key of tablespace object provided.
11124 Caller should allocate buffer for encrypted page
11125 @param[in] space tablespace object
11126 @param[in] in_page unencrypted page
11127 @param[in,out] encrypted_buf buffer to hold the encrypted page
11128 @param[in] encrypted_buf_len length of the encrypted buffer
11129 @return true on success, false on failure */
11130 bool
os_dblwr_encrypt_page(fil_space_t * space,page_t * in_page,page_t * encrypted_buf,ulint encrypted_buf_len)11131 os_dblwr_encrypt_page(
11132 fil_space_t* space,
11133 page_t* in_page,
11134 page_t* encrypted_buf,
11135 ulint encrypted_buf_len)
11136 {
11137 if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
11138 return(false);
11139 }
11140
11141 IORequest write_request(IORequest::WRITE);
11142 page_size_t page_size(space->flags);
11143
11144 write_request.encryption_key(
11145 space->encryption_key,
11146 space->encryption_klen,
11147 false,
11148 space->encryption_iv,
11149 0, 0, NULL, NULL);
11150 write_request.encryption_algorithm(
11151 Encryption::AES);
11152
11153 ulint bytes = page_size.physical();
11154
11155 /* After successful encryption, in_page will point
11156 to a new memory block which is encrypted and
11157 the bytes will have value of length of encrypted data */
11158 void* in_page_before = in_page;
11159 Block* block = os_file_encrypt_page(
11160 write_request,
11161 in_page_before,
11162 &bytes);
11163
11164 ut_ad(block != NULL);
11165
11166 if (in_page_before == in_page) {
11167 os_free_block(block);
11168 return(false);
11169 }
11170
11171 ut_ad(bytes == page_size.physical());
11172 ut_ad(bytes <= encrypted_buf_len);
11173
11174 memcpy(encrypted_buf, in_page_before /*encrypted page*/,
11175 bytes);
11176
11177 os_free_block(block);
11178 return(true);
11179 }
11180
11181 /** Decrypt a page from doublewrite buffer. Tablespace object
11182 (fil_space_t) must have encryption key, iv set properly.
11183 The decrpyted page will be written in the same buffer of input page.
11184 @param[in] space tablespace obejct
11185 @param[in,out] page in: encrypted page
11186 out: decrypted page
11187 @return DB_SUCCESS on success, others on failure */
11188 dberr_t
os_dblwr_decrypt_page(fil_space_t * space,page_t * page)11189 os_dblwr_decrypt_page(
11190 fil_space_t* space,
11191 page_t* page)
11192 {
11193 if (!FSP_FLAGS_GET_ENCRYPTION(space->flags)) {
11194 return(DB_SUCCESS);
11195 }
11196
11197 page_size_t page_size(space->flags);
11198
11199 IORequest decrypt_request;
11200
11201 decrypt_request.encryption_key(
11202 space->encryption_key,
11203 space->encryption_klen,
11204 false,
11205 space->encryption_iv,
11206 0, 0, NULL, NULL);
11207
11208 decrypt_request.encryption_algorithm(
11209 Encryption::AES);
11210
11211 Encryption encryption(
11212 decrypt_request.encryption_algorithm());
11213
11214 dberr_t err = encryption.decrypt(
11215 decrypt_request,
11216 page, page_size.physical(), NULL,
11217 page_size.physical());
11218
11219 ut_ad(err == DB_SUCCESS);
11220 return(err);
11221 }
11222
11223 #endif
11224
11225 /** Normalizes a directory path for the current OS:
11226 On Windows, we convert '/' to '\', else we convert '\' to '/'.
11227 @param[in,out] str A null-terminated directory and file path */
11228 void
os_normalize_path(char * str)11229 os_normalize_path(
11230 char* str)
11231 {
11232 if (str != NULL) {
11233 for (; *str; str++) {
11234 if (*str == OS_PATH_SEPARATOR_ALT) {
11235 *str = OS_PATH_SEPARATOR;
11236 }
11237 }
11238 }
11239 }
11240