1 /*
 * Copyright (c) 2020 System Fabric Works, Inc. All Rights Reserved.
3 * mailto:info@systemfabricworks.com
4 *
5 * License: GPLv2, see COPYING.
6 *
7 * libcufile engine
8 *
9 * fio I/O engine using the NVIDIA cuFile API.
10 *
11 */
12
13 #include <stdlib.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <string.h>
17 #include <sys/time.h>
18 #include <sys/resource.h>
19 #include <cufile.h>
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 #include <pthread.h>
23
24 #include "../fio.h"
25 #include "../lib/pow2.h"
26 #include "../optgroup.h"
27 #include "../lib/memalign.h"
28
/* True when v is a multiple of 4 KiB (low 12 bits clear) — cuFile prefers
   4KB-aligned lengths and GPU offsets. */
#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

/* Bits for libcufile_options.logged: each alignment warning is emitted
   at most once per job to avoid flooding the log. */
#define LOGGED_BUFLEN_NOT_ALIGNED 0x01
#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
/* Separator between GPU ids in the gpu_dev_ids option string. */
#define GPU_ID_SEP ":"
34
/* Values for the cuda_io option: how data moves between GPU and storage. */
enum {
	IO_CUFILE = 1,	/* GPUDirect Storage via the cuFile API */
	IO_POSIX = 2	/* plain POSIX pread/pwrite + explicit cudaMemcpy */
};
39
/* Per-job engine options and state (one instance per thread_data->eo). */
struct libcufile_options {
	struct thread_data *td;
	char *gpu_ids;		/* colon-separated list of GPU ids,
				   one per job */
	void *cu_mem_ptr;	/* GPU memory */
	void *junk_buf;		/* buffer to simulate cudaMemcpy with
				   posix I/O write */
	int my_gpu_id;		/* GPU id to use for this job */
	unsigned int cuda_io;	/* Type of I/O to use with CUDA */
	size_t total_mem;	/* size for cu_mem_ptr and junk_buf */
	int logged;		/* bitmask of log messages that have
				   been output, prevent flood */
};
53
/* Per-file cuFile state, stashed in the fio_file engine-data slot
   (only allocated when cuda_io=cufile). */
struct fio_libcufile_data {
	CUfileDescr_t cf_descr;		/* descriptor wrapping the POSIX fd */
	CUfileHandle_t cf_handle;	/* handle from cuFileHandleRegister() */
};
58
/* fio option table for this engine: gpu_dev_ids and cuda_io. */
static struct fio_option options[] = {
	{
		.name	= "gpu_dev_ids",
		.lname	= "libcufile engine gpu dev ids",
		.type	= FIO_OPT_STR_STORE,
		.off1	= offsetof(struct libcufile_options, gpu_ids),
		.help	= "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	= "cuda_io",
		.lname	= "libcufile cuda io",
		.type	= FIO_OPT_STR,
		.off1	= offsetof(struct libcufile_options, cuda_io),
		.help	= "Type of I/O to use with CUDA",
		.def    = "cufile",
		.posval = {
			  { .ival = "cufile",
			    .oval = IO_CUFILE,
			    .help = "libcufile nvidia-fs"
			  },
			  { .ival = "posix",
			    .oval = IO_POSIX,
			    .help = "POSIX I/O"
			  }
		},
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	= NULL,
	},
};
93
/* Process-wide state shared by all worker threads, guarded by running_lock:
   'running' counts live workers; the cuFile driver is opened by the first
   and closed by the last. */
static int running = 0;
static int cufile_initialized = 0;
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
97
/* Run a CUDA runtime call 'fn', log any failure with its name and line,
   and set 'rc' to 0 on success or -1 on error. 'rc' must be an lvalue. */
#define check_cudaruntimecall(fn, rc)					\
	do {								\
		cudaError_t res = fn;					\
		if (res != cudaSuccess) {				\
			const char *str = cudaGetErrorName(res);	\
			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
				#fn, __LINE__, res, str);		\
			rc = -1;					\
		} else							\
			rc = 0;						\
	} while(0)
109
/*
 * Translate a cuFile status into a printable message for log output.
 * Falls back to "unknown" when the status carries no cuFile error code.
 */
static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
{
	const char *msg = "unknown";

	if (IS_CUFILE_ERR(st.err))
		msg = cufileop_status_error(st.err);

	return msg;
}
116
117 /*
118 * Assign GPU to subjob roundrobin, similar to how multiple
119 * entries in 'directory' are handled by fio.
120 */
fio_libcufile_find_gpu_id(struct thread_data * td)121 static int fio_libcufile_find_gpu_id(struct thread_data *td)
122 {
123 struct libcufile_options *o = td->eo;
124 int gpu_id = 0;
125
126 if (o->gpu_ids != NULL) {
127 char *gpu_ids, *pos, *cur;
128 int i, id_count, gpu_idx;
129
130 for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
131 cur = strchr(cur, GPU_ID_SEP[0]);
132 if (cur != NULL)
133 cur++;
134 }
135
136 gpu_idx = td->subjob_number % id_count;
137
138 pos = gpu_ids = strdup(o->gpu_ids);
139 if (gpu_ids == NULL) {
140 log_err("strdup(gpu_ids): err=%d\n", errno);
141 return -1;
142 }
143
144 i = 0;
145 while (pos != NULL && i <= gpu_idx) {
146 i++;
147 cur = strsep(&pos, GPU_ID_SEP);
148 }
149
150 if (cur)
151 gpu_id = atoi(cur);
152
153 free(gpu_ids);
154 }
155
156 return gpu_id;
157 }
158
/*
 * Per-thread engine init. The first worker thread to arrive opens the
 * cuFile driver (when cuda_io=cufile); later workers only increment the
 * refcount. Each job is then bound to its round-robin-assigned GPU with
 * cudaSetDevice(). Returns 0 on success, 1 on failure.
 */
static int fio_libcufile_init(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
		assert(cufile_initialized == 0);
		if (o->cuda_io == IO_CUFILE) {
			/* only open the driver if this is the first worker thread */
			status = cuFileDriverOpen();
			if (status.err != CU_FILE_SUCCESS)
				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
					fio_libcufile_get_cuda_error(status));
			else
				cufile_initialized = 1;
		}
	}
	running++;
	/* Snapshot the global under the lock before releasing it. */
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	/* cuFile mode is unusable if the driver failed to open. */
	if (o->cuda_io == IO_CUFILE && !initialized)
		return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
		return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
		return 1;

	return 0;
}
197
/*
 * Prepare buffers before a write is issued.
 *
 * cufile + verify: copy the io_u buffer into GPU memory so the bytes
 * written through cuFile match what the verifier generated. This is
 * skipped when not verifying because a real cuFile application would
 * already have the data resident in GPU memory.
 *
 * posix: copy GPU memory into the junk buffer. The data written comes
 * from the CPU buffer; this copy only exists to charge the cudaMemcpy()
 * overhead a POSIX-I/O CUDA application would pay.
 *
 * Returns 0 on success. On cudaMemcpy failure, sets io_u->error = EIO
 * and returns nonzero; returns EINVAL for an unknown cuda_io type.
 */
static inline int fio_libcufile_pre_write(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
		if (td->o.verify) {
			check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
							 io_u->xfer_buf,
							 io_u->xfer_buflen,
							 cudaMemcpyHostToDevice), rc);
			if (rc != 0) {
				log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
				io_u->error = EIO;
			}
		}
	} else if (o->cuda_io == IO_POSIX) {
		/* Cast to char* before the offset: arithmetic on void* is a
		   GCC extension, not standard C. */
		check_cudaruntimecall(cudaMemcpy(((char*) o->junk_buf) + gpu_offset,
						 ((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buflen,
						 cudaMemcpyDeviceToHost), rc);
		if (rc != 0) {
			log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
			io_u->error = EIO;
		}
	} else {
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
	}

	return rc;
}
246
/*
 * Finish a read: move data to where the rest of fio expects it.
 *
 * cufile + verify: copy GPU memory back into the io_u buffer so the
 * verifier can check it (skipped when not verifying).
 * posix: copy the CPU buffer that was just read into GPU memory, as a
 * POSIX-I/O CUDA application would.
 *
 * Returns 0 on success; sets io_u->error = EIO and returns nonzero on a
 * cudaMemcpy failure, EINVAL for an unknown cuda_io type.
 */
static inline int fio_libcufile_post_read(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	switch (o->cuda_io) {
	case IO_CUFILE:
		if (!td->o.verify)
			break;
		/* Copy GPU memory to CPU buffer for verify */
		check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
						 ((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buflen,
						 cudaMemcpyDeviceToHost), rc);
		if (rc != 0) {
			log_err("DDIR_READ cudaMemcpy D2H failed\n");
			io_u->error = EIO;
		}
		break;

	case IO_POSIX:
		/* POSIX I/O read, copy the CPU buffer to GPU memory */
		check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buf,
						 io_u->xfer_buflen,
						 cudaMemcpyHostToDevice), rc);
		if (rc != 0) {
			log_err("DDIR_READ cudaMemcpy H2D failed\n");
			io_u->error = EIO;
		}
		break;

	default:
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
		break;
	}

	return rc;
}
284
fio_libcufile_queue(struct thread_data * td,struct io_u * io_u)285 static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
286 struct io_u *io_u)
287 {
288 struct libcufile_options *o = td->eo;
289 struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
290 unsigned long long io_offset;
291 ssize_t sz;
292 ssize_t remaining;
293 size_t xfered;
294 size_t gpu_offset;
295 int rc;
296
297 if (o->cuda_io == IO_CUFILE && fcd == NULL) {
298 io_u->error = EINVAL;
299 td_verror(td, EINVAL, "xfer");
300 return FIO_Q_COMPLETED;
301 }
302
303 fio_ro_check(td, io_u);
304
305 switch(io_u->ddir) {
306 case DDIR_SYNC:
307 rc = fsync(io_u->file->fd);
308 if (rc != 0) {
309 io_u->error = errno;
310 log_err("fsync: err=%d\n", errno);
311 }
312 break;
313
314 case DDIR_DATASYNC:
315 rc = fdatasync(io_u->file->fd);
316 if (rc != 0) {
317 io_u->error = errno;
318 log_err("fdatasync: err=%d\n", errno);
319 }
320 break;
321
322 case DDIR_READ:
323 case DDIR_WRITE:
324 /*
325 There may be a better way to calculate gpu_offset. The intent is
326 that gpu_offset equals the the difference between io_u->xfer_buf and
327 the page-aligned base address for io_u buffers.
328 */
329 gpu_offset = io_u->index * io_u->xfer_buflen;
330 io_offset = io_u->offset;
331 remaining = io_u->xfer_buflen;
332
333 xfered = 0;
334 sz = 0;
335
336 assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
337
338 if (o->cuda_io == IO_CUFILE) {
339 if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
340 (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
341 log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
342 o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
343 }
344
345 if (!(ALIGNED_4KB(gpu_offset) ||
346 (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
347 log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
348 o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
349 }
350 }
351
352 if (io_u->ddir == DDIR_WRITE)
353 rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
354
355 if (io_u->error != 0)
356 break;
357
358 while (remaining > 0) {
359 assert(gpu_offset + xfered <= o->total_mem);
360 if (io_u->ddir == DDIR_READ) {
361 if (o->cuda_io == IO_CUFILE) {
362 sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
363 io_offset + xfered, gpu_offset + xfered);
364 if (sz == -1) {
365 io_u->error = errno;
366 log_err("cuFileRead: err=%d\n", errno);
367 } else if (sz < 0) {
368 io_u->error = EIO;
369 log_err("cuFileRead: err=%ld:%s\n", sz,
370 cufileop_status_error(-sz));
371 }
372 } else if (o->cuda_io == IO_POSIX) {
373 sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
374 remaining, io_offset + xfered);
375 if (sz < 0) {
376 io_u->error = errno;
377 log_err("pread: err=%d\n", errno);
378 }
379 } else {
380 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
381 io_u->error = -1;
382 assert(0);
383 }
384 } else if (io_u->ddir == DDIR_WRITE) {
385 if (o->cuda_io == IO_CUFILE) {
386 sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
387 io_offset + xfered, gpu_offset + xfered);
388 if (sz == -1) {
389 io_u->error = errno;
390 log_err("cuFileWrite: err=%d\n", errno);
391 } else if (sz < 0) {
392 io_u->error = EIO;
393 log_err("cuFileWrite: err=%ld:%s\n", sz,
394 cufileop_status_error(-sz));
395 }
396 } else if (o->cuda_io == IO_POSIX) {
397 sz = pwrite(io_u->file->fd,
398 ((char*) io_u->xfer_buf) + xfered,
399 remaining, io_offset + xfered);
400 if (sz < 0) {
401 io_u->error = errno;
402 log_err("pwrite: err=%d\n", errno);
403 }
404 } else {
405 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
406 io_u->error = -1;
407 assert(0);
408 }
409 } else {
410 log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
411 io_u->error = -1;
412 assert(0);
413 break;
414 }
415
416 if (io_u->error != 0)
417 break;
418
419 remaining -= sz;
420 xfered += sz;
421
422 if (remaining != 0)
423 log_info("Incomplete %s: %ld bytes remaining\n",
424 io_u->ddir == DDIR_READ? "read" : "write", remaining);
425 }
426
427 if (io_u->error != 0)
428 break;
429
430 if (io_u->ddir == DDIR_READ)
431 rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
432 break;
433
434 default:
435 io_u->error = EINVAL;
436 break;
437 }
438
439 if (io_u->error != 0) {
440 log_err("IO failed\n");
441 td_verror(td, io_u->error, "xfer");
442 }
443
444 return FIO_Q_COMPLETED;
445 }
446
fio_libcufile_open_file(struct thread_data * td,struct fio_file * f)447 static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
448 {
449 struct libcufile_options *o = td->eo;
450 struct fio_libcufile_data *fcd = NULL;
451 int rc;
452 CUfileError_t status;
453
454 rc = generic_open_file(td, f);
455 if (rc)
456 return rc;
457
458 if (o->cuda_io == IO_CUFILE) {
459 fcd = calloc(1, sizeof(*fcd));
460 if (fcd == NULL) {
461 rc = ENOMEM;
462 goto exit_err;
463 }
464
465 fcd->cf_descr.handle.fd = f->fd;
466 fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
467 status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
468 if (status.err != CU_FILE_SUCCESS) {
469 log_err("cufile register: err=%d:%s\n", status.err,
470 fio_libcufile_get_cuda_error(status));
471 rc = EINVAL;
472 goto exit_err;
473 }
474 }
475
476 FILE_SET_ENG_DATA(f, fcd);
477 return 0;
478
479 exit_err:
480 if (fcd) {
481 free(fcd);
482 fcd = NULL;
483 }
484 if (f) {
485 int rc2 = generic_close_file(td, f);
486 if (rc2)
487 log_err("generic_close_file: err=%d\n", rc2);
488 }
489 return rc;
490 }
491
fio_libcufile_close_file(struct thread_data * td,struct fio_file * f)492 static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
493 {
494 struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
495 int rc;
496
497 if (fcd != NULL) {
498 cuFileHandleDeregister(fcd->cf_handle);
499 FILE_SET_ENG_DATA(f, NULL);
500 free(fcd);
501 }
502
503 rc = generic_close_file(td, f);
504
505 return rc;
506 }
507
/*
 * Allocate this job's I/O memory: the host orig_buffer fio hands out as
 * io_u buffers, a GPU buffer of the same size, and (POSIX mode only) a
 * junk CPU buffer used to simulate D2H copy overhead. In cuFile mode
 * the GPU buffer is also registered with cuFileBufRegister().
 * Returns 0 on success, 1 on failure (all partial allocations freed).
 */
static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
		log_err("orig_buffer calloc failed: err=%d\n", errno);
		goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
		/* Sink for the simulated pre-write D2H copy. */
		o->junk_buf = calloc(1, total_mem);
		if (o->junk_buf == NULL) {
			log_err("junk_buf calloc failed: err=%d\n", errno);
			goto exit_error;
		}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
		goto exit_error;
	/* Fill GPU memory with a known pattern so writes carry data. */
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
		goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
		/* Pin/register the GPU buffer for GPUDirect transfers. */
		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			goto exit_error;
		}
	}

	return 0;

exit_error:
	/* Unwind whatever was allocated before the failure. */
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	return 1;
}
566
fio_libcufile_iomem_free(struct thread_data * td)567 static void fio_libcufile_iomem_free(struct thread_data *td)
568 {
569 struct libcufile_options *o = td->eo;
570
571 if (o->junk_buf) {
572 free(o->junk_buf);
573 o->junk_buf = NULL;
574 }
575 if (o->cu_mem_ptr) {
576 if (o->cuda_io == IO_CUFILE)
577 cuFileBufDeregister(o->cu_mem_ptr);
578 cudaFree(o->cu_mem_ptr);
579 o->cu_mem_ptr = NULL;
580 }
581 if (td->orig_buffer) {
582 free(td->orig_buffer);
583 td->orig_buffer = NULL;
584 }
585 }
586
fio_libcufile_cleanup(struct thread_data * td)587 static void fio_libcufile_cleanup(struct thread_data *td)
588 {
589 struct libcufile_options *o = td->eo;
590
591 pthread_mutex_lock(&running_lock);
592 running--;
593 assert(running >= 0);
594 if (running == 0) {
595 /* only close the driver if initialized and
596 this is the last worker thread */
597 if (o->cuda_io == IO_CUFILE && cufile_initialized)
598 cuFileDriverClose();
599 cufile_initialized = 0;
600 }
601 pthread_mutex_unlock(&running_lock);
602 }
603
/* Engine ops table: synchronous engine (FIO_SYNCIO), no event hooks. */
FIO_STATIC struct ioengine_ops ioengine = {
	.name			= "libcufile",
	.version		= FIO_IOOPS_VERSION,
	.init			= fio_libcufile_init,
	.queue			= fio_libcufile_queue,
	.open_file		= fio_libcufile_open_file,
	.close_file		= fio_libcufile_close_file,
	.iomem_alloc		= fio_libcufile_iomem_alloc,
	.iomem_free		= fio_libcufile_iomem_free,
	.cleanup		= fio_libcufile_cleanup,
	.flags			= FIO_SYNCIO,
	.options		= options,
	.option_struct_size	= sizeof(struct libcufile_options)
};
618
/* Constructor: register the engine with fio at load time. */
void fio_init fio_libcufile_register(void)
{
	register_ioengine(&ioengine);
}
623
/* Destructor: remove the engine when the module is unloaded. */
void fio_exit fio_libcufile_unregister(void)
{
	unregister_ioengine(&ioengine);
}
628