/*
 * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
 * mailto:info@systemfabricworks.com
 *
 * License: GPLv2, see COPYING.
 *
 * libcufile engine
 *
 * fio I/O engine using the NVIDIA cuFile API.
 *
 */

#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <cufile.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pthread.h>

#include "../fio.h"
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"

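/* True when v is a multiple of 4 KiB; used below to warn once when buffer
   lengths or GPU offsets are not aligned the way cuFile expects. */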
#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

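/* Bits in libcufile_options.logged so each alignment warning is emitted
   at most once per job rather than flooding the log. */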
#define LOGGED_BUFLEN_NOT_ALIGNED     0x01
#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
#define GPU_ID_SEP ":"

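/* Values for the cuda_io option: drive transfers through the cuFile API
   (GPUDirect Storage) or fall back to plain POSIX read/write. */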
enum {
	IO_CUFILE    = 1,
	IO_POSIX     = 2
};

struct libcufile_options {
	struct thread_data *td;
	char               *gpu_ids;       /* colon-separated list of GPU ids,
					      one per job */
	void               *cu_mem_ptr;    /* GPU memory */
	void               *junk_buf;      /* buffer to simulate cudaMemcpy with
					      posix I/O write */
	int                 my_gpu_id;     /* GPU id to use for this job */
	unsigned int        cuda_io;       /* Type of I/O to use with CUDA */
	size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
	int                 logged;        /* bitmask of log messages that have
					      been output, prevents flooding */
};

struct fio_libcufile_data {
	CUfileDescr_t  cf_descr;
	CUfileHandle_t cf_handle;
};

static struct fio_option options[] = {
	{
		.name	  = "gpu_dev_ids",
		.lname	  = "libcufile engine gpu dev ids",
		.type	  = FIO_OPT_STR_STORE,
		.off1	  = offsetof(struct libcufile_options, gpu_ids),
		.help	  = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
		.category = FIO_OPT_C_ENGINE,
		.group	  = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	  = "cuda_io",
		.lname	  = "libcufile cuda io",
		.type	  = FIO_OPT_STR,
		.off1	  = offsetof(struct libcufile_options, cuda_io),
		.help	  = "Type of I/O to use with CUDA",
		.def      = "cufile",
		.posval   = {
			    { .ival = "cufile",
			      .oval = IO_CUFILE,
			      .help = "libcufile nvidia-fs"
			    },
			    { .ival = "posix",
			      .oval = IO_POSIX,
			      .help = "POSIX I/O"
			    }
		},
		.category = FIO_OPT_C_ENGINE,
		.group	  = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	 = NULL,
	},
};

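/* The cuFile driver is opened once per process: "running" counts the active
   worker threads so the first one opens the driver and the last one to
   finish closes it (see fio_libcufile_init/fio_libcufile_cleanup). */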
static int running = 0;
static int cufile_initialized = 0;
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

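/* Run a CUDA runtime call, log any failure with its error name, and set rc
   to 0 on success or -1 on error. */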
#define check_cudaruntimecall(fn, rc)                                               \
	do {                                                                        \
		cudaError_t res = fn;                                               \
		if (res != cudaSuccess) {                                           \
			const char *str = cudaGetErrorName(res);                    \
			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
				#fn, __LINE__, res, str);                           \
			rc = -1;                                                    \
		} else                                                              \
			rc = 0;                                                     \
	} while(0)

static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
{
	if (IS_CUFILE_ERR(st.err))
		return cufileop_status_error(st.err);
	return "unknown";
}

/*
 * Assign a GPU to each subjob round-robin, similar to how multiple
 * entries in 'directory' are handled by fio.
 */
static int fio_libcufile_find_gpu_id(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	int gpu_id = 0;

	if (o->gpu_ids != NULL) {
		char *gpu_ids, *pos, *cur;
		int i, id_count, gpu_idx;

		/* count the entries in the colon-separated list */
		for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
			cur = strchr(cur, GPU_ID_SEP[0]);
			if (cur != NULL)
				cur++;
		}

		gpu_idx = td->subjob_number % id_count;

		pos = gpu_ids = strdup(o->gpu_ids);
		if (gpu_ids == NULL) {
			log_err("strdup(gpu_ids): err=%d\n", errno);
			return -1;
		}

		/* walk the copy to this subjob's entry */
		i = 0;
		while (pos != NULL && i <= gpu_idx) {
			i++;
			cur = strsep(&pos, GPU_ID_SEP);
		}

		if (cur)
			gpu_id = atoi(cur);

		free(gpu_ids);
	}

	return gpu_id;
}

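/*
 * Per-job initialization: the first worker opens the cuFile driver, then
 * each job picks its GPU and binds the CUDA device for this thread.
 */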
static int fio_libcufile_init(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
		assert(cufile_initialized == 0);
		if (o->cuda_io == IO_CUFILE) {
			/* only open the driver if this is the first worker thread */
			status = cuFileDriverOpen();
			if (status.err != CU_FILE_SUCCESS)
				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
					fio_libcufile_get_cuda_error(status));
			else
				cufile_initialized = 1;
		}
	}
	running++;
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	if (o->cuda_io == IO_CUFILE && !initialized)
		return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
		return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
		return 1;

	return 0;
}

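/*
 * Called before a write is issued. In cufile mode with verify enabled the
 * io_u buffer is copied into GPU memory; in posix mode the GPU buffer is
 * copied into the junk buffer purely to account for the cudaMemcpy()
 * overhead a POSIX CUDA application would pay.
 */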
static inline int fio_libcufile_pre_write(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
		if (td->o.verify) {
			/*
			  Data is being verified, copy the io_u buffer to GPU memory.
			  This isn't done in the non-verify case because the data would
			  already be in GPU memory in a normal cuFile application.
			*/
			check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
							 io_u->xfer_buf,
							 io_u->xfer_buflen,
							 cudaMemcpyHostToDevice), rc);
			if (rc != 0) {
				log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
				io_u->error = EIO;
			}
		}
	} else if (o->cuda_io == IO_POSIX) {

		/*
		  POSIX I/O is being used, so the data has to be copied out of the
		  GPU into a CPU buffer. GPU memory doesn't contain the actual
		  data to write, so the data is copied to the junk buffer. The
		  purpose of this is to add the overhead of cudaMemcpy() that would
		  be present in a POSIX I/O CUDA application.
		*/
		check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
						 ((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buflen,
						 cudaMemcpyDeviceToHost), rc);
		if (rc != 0) {
			log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
			io_u->error = EIO;
		}
	} else {
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
	}

	return rc;
}

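/*
 * Called after a read completes. In cufile mode with verify enabled the
 * data is copied from GPU memory back into the io_u buffer so fio can
 * verify it; in posix mode the freshly read CPU buffer is copied to GPU
 * memory, as a POSIX CUDA application would do.
 */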
static inline int fio_libcufile_post_read(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
		if (td->o.verify) {
			/* Copy GPU memory to CPU buffer for verify */
			check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
							 ((char*) o->cu_mem_ptr) + gpu_offset,
							 io_u->xfer_buflen,
							 cudaMemcpyDeviceToHost), rc);
			if (rc != 0) {
				log_err("DDIR_READ cudaMemcpy D2H failed\n");
				io_u->error = EIO;
			}
		}
	} else if (o->cuda_io == IO_POSIX) {
		/* POSIX I/O read, copy the CPU buffer to GPU memory */
		check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buf,
						 io_u->xfer_buflen,
						 cudaMemcpyHostToDevice), rc);
		if (rc != 0) {
			log_err("DDIR_READ cudaMemcpy H2D failed\n");
			io_u->error = EIO;
		}
	} else {
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
	}

	return rc;
}

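/*
 * Synchronous queue handler: each io_u is fully transferred (or fails)
 * before this returns. DDIR_SYNC/DDIR_DATASYNC map to fsync()/fdatasync();
 * reads and writes go through cuFile or POSIX pread()/pwrite() depending
 * on the cuda_io option.
 */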
static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
					     struct io_u *io_u)
{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
	unsigned long long io_offset;
	ssize_t sz;
	ssize_t remaining;
	size_t xfered;
	size_t gpu_offset;
	int rc;

	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
		io_u->error = EINVAL;
		td_verror(td, EINVAL, "xfer");
		return FIO_Q_COMPLETED;
	}

	fio_ro_check(td, io_u);

	switch(io_u->ddir) {
	case DDIR_SYNC:
		rc = fsync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fsync: err=%d\n", errno);
		}
		break;

	case DDIR_DATASYNC:
		rc = fdatasync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fdatasync: err=%d\n", errno);
		}
		break;

	case DDIR_READ:
	case DDIR_WRITE:
		/*
		  There may be a better way to calculate gpu_offset. The intent is
		  that gpu_offset equals the difference between io_u->xfer_buf and
		  the page-aligned base address for io_u buffers.
		*/
		gpu_offset = io_u->index * io_u->xfer_buflen;
		io_offset = io_u->offset;
		remaining = io_u->xfer_buflen;

		xfered = 0;
		sz = 0;

		assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);

		if (o->cuda_io == IO_CUFILE) {
			if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
			      (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
				log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
				o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
			}

			if (!(ALIGNED_4KB(gpu_offset) ||
			      (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
				log_err("gpu_offset not 4KB-aligned: %zu\n", gpu_offset);
				o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
			}
		}

		if (io_u->ddir == DDIR_WRITE)
			rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);

		if (io_u->error != 0)
			break;

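		/*
		 * Issue the transfer in a loop: cuFileRead/cuFileWrite and
		 * pread/pwrite may return a short count, so keep going until
		 * the whole buffer has been transferred or an error occurs.
		 */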
		while (remaining > 0) {
			assert(gpu_offset + xfered <= o->total_mem);
			if (io_u->ddir == DDIR_READ) {
				if (o->cuda_io == IO_CUFILE) {
					sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
							io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileRead: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileRead: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
						   remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pread: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else if (io_u->ddir == DDIR_WRITE) {
				if (o->cuda_io == IO_CUFILE) {
					sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
							 io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileWrite: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileWrite: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pwrite(io_u->file->fd,
						    ((char*) io_u->xfer_buf) + xfered,
						    remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pwrite: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else {
				log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
				io_u->error = -1;
				assert(0);
				break;
			}

			if (io_u->error != 0)
				break;

			remaining -= sz;
			xfered += sz;

			if (remaining != 0)
				log_info("Incomplete %s: %ld bytes remaining\n",
					 io_u->ddir == DDIR_READ? "read" : "write", remaining);
		}

		if (io_u->error != 0)
			break;

		if (io_u->ddir == DDIR_READ)
			rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
		break;

	default:
		io_u->error = EINVAL;
		break;
	}

	if (io_u->error != 0) {
		log_err("IO failed\n");
		td_verror(td, io_u->error, "xfer");
	}

	return FIO_Q_COMPLETED;
}

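/*
 * Open the file with the generic handler; in cufile mode the resulting fd
 * is also registered with cuFileHandleRegister() and the handle is stashed
 * in the file's engine data.
 */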
static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = NULL;
	int rc;
	CUfileError_t status;

	rc = generic_open_file(td, f);
	if (rc)
		return rc;

	if (o->cuda_io == IO_CUFILE) {
		fcd = calloc(1, sizeof(*fcd));
		if (fcd == NULL) {
			rc = ENOMEM;
			goto exit_err;
		}

		fcd->cf_descr.handle.fd = f->fd;
		fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
		status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cufile register: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			rc = EINVAL;
			goto exit_err;
		}
	}

	FILE_SET_ENG_DATA(f, fcd);
	return 0;

exit_err:
	if (fcd) {
		free(fcd);
		fcd = NULL;
	}
	if (f) {
		int rc2 = generic_close_file(td, f);
		if (rc2)
			log_err("generic_close_file: err=%d\n", rc2);
	}
	return rc;
}

static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
	int rc;

	if (fcd != NULL) {
		cuFileHandleDeregister(fcd->cf_handle);
		FILE_SET_ENG_DATA(f, NULL);
		free(fcd);
	}

	rc = generic_close_file(td, f);

	return rc;
}

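/*
 * Allocate the I/O buffers for this job: a host buffer for fio itself, a
 * GPU buffer of the same size, and (in posix mode) a junk buffer that
 * receives the device-to-host copies. In cufile mode the GPU buffer is
 * registered with cuFileBufRegister().
 */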
static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
		log_err("orig_buffer calloc failed: err=%d\n", errno);
		goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
		o->junk_buf = calloc(1, total_mem);
		if (o->junk_buf == NULL) {
			log_err("junk_buf calloc failed: err=%d\n", errno);
			goto exit_error;
		}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
		goto exit_error;
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
		goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			goto exit_error;
		}
	}

	return 0;

exit_error:
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	return 1;
}

static void fio_libcufile_iomem_free(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		if (o->cuda_io == IO_CUFILE)
			cuFileBufDeregister(o->cu_mem_ptr);
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
}

static void fio_libcufile_cleanup(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	pthread_mutex_lock(&running_lock);
	running--;
	assert(running >= 0);
	if (running == 0) {
		/* only close the driver if initialized and
		   this is the last worker thread */
		if (o->cuda_io == IO_CUFILE && cufile_initialized)
			cuFileDriverClose();
		cufile_initialized = 0;
	}
	pthread_mutex_unlock(&running_lock);
}

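/*
 * The engine is registered as synchronous (FIO_SYNCIO): .queue() always
 * completes the io_u before returning, so no event/getevents hooks are
 * needed.
 */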
FIO_STATIC struct ioengine_ops ioengine = {
	.name                = "libcufile",
	.version             = FIO_IOOPS_VERSION,
	.init                = fio_libcufile_init,
	.queue               = fio_libcufile_queue,
	.open_file           = fio_libcufile_open_file,
	.close_file          = fio_libcufile_close_file,
	.iomem_alloc         = fio_libcufile_iomem_alloc,
	.iomem_free          = fio_libcufile_iomem_free,
	.cleanup             = fio_libcufile_cleanup,
	.flags               = FIO_SYNCIO,
	.options             = options,
	.option_struct_size  = sizeof(struct libcufile_options)
};

void fio_init fio_libcufile_register(void)
{
	register_ioengine(&ioengine);
}

void fio_exit fio_libcufile_unregister(void)
{
	unregister_ioengine(&ioengine);
}