/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 */

#include <assert.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "yaksi.h"
#include "yaksuri_cudai.h"
#include <stdlib.h>

#define THREAD_BLOCK_SIZE (256)

/* CUDA caps a kernel grid at 2^31 - 1 blocks in the x dimension and
 * 65535 blocks in each of the y and z dimensions */
#define MAX_GRIDSZ_X ((1U << 31) - 1)
#define MAX_GRIDSZ_Y (65535U)
#define MAX_GRIDSZ_Z (65535U)

/* maximum number of iovec entries requested from yaksi_iov per
 * iteration; also the size of the on-stack iov arrays below */
#define MAX_IOV_LENGTH (8192)

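/*
 * Compute the CUDA launch geometry for a pack/unpack kernel: the total
 * number of elements (count * num_elements) is divided into blocks of
 * THREAD_BLOCK_SIZE threads, and the resulting block count is spread
 * across the x, y, and z grid dimensions so that no dimension exceeds
 * its limit.  For example, 10,000,000 elements need
 * YAKSU_CEIL(10000000, 256) = 39063 blocks, which fits into the x
 * dimension alone, giving a (39063, 1, 1) grid.
 */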
static int get_thread_block_dims(uintptr_t count, yaksi_type_s * type, unsigned int *n_threads,
                                 unsigned int *n_blocks_x, unsigned int *n_blocks_y,
                                 unsigned int *n_blocks_z)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;

    *n_threads = THREAD_BLOCK_SIZE;
    uintptr_t n_blocks = count * cuda_type->num_elements / THREAD_BLOCK_SIZE;
    n_blocks += !!(count * cuda_type->num_elements % THREAD_BLOCK_SIZE);    /* round up */

    if (n_blocks <= MAX_GRIDSZ_X) {
        *n_blocks_x = (unsigned int) n_blocks;
        *n_blocks_y = 1;
        *n_blocks_z = 1;
    } else if (n_blocks <= (uintptr_t) MAX_GRIDSZ_X * MAX_GRIDSZ_Y) {      /* cast avoids 32-bit overflow */
        *n_blocks_x = (unsigned int) (YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Y));
        *n_blocks_y = (unsigned int) (YAKSU_CEIL(n_blocks, (*n_blocks_x)));
        *n_blocks_z = 1;
    } else {
        uintptr_t n_blocks_xy = YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Z);
        *n_blocks_x = (unsigned int) (YAKSU_CEIL(n_blocks_xy, MAX_GRIDSZ_Y));
        *n_blocks_y = (unsigned int) (YAKSU_CEIL(n_blocks_xy, (*n_blocks_x)));
        *n_blocks_z =
            (unsigned int) (YAKSU_CEIL(n_blocks, (uintptr_t) (*n_blocks_x) * (*n_blocks_y)));
    }

    return rc;
}

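/*
 * GPU pack/unpack is available when the type is contiguous (a plain
 * cudaMemcpyAsync suffices) or when a pack kernel was generated for
 * it; otherwise the caller is expected to fall back to another path.
 */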
int yaksuri_cudai_pup_is_supported(yaksi_type_s * type, bool * is_supported)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;

    if (type->is_contig || cuda_type->pack)
        *is_supported = true;
    else
        *is_supported = false;

    return rc;
}

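/*
 * IOV thresholds: in ipack/iunpack below, if the average contiguous
 * segment of a type (type->size / type->num_contig) is at least this
 * many bytes, a loop of cudaMemcpyAsync calls over the type's IOV
 * representation is used instead of the pack/unpack kernel.  The
 * default can be overridden through an info hint.
 */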
uintptr_t yaksuri_cudai_get_iov_pack_threshold(yaksi_info_s * info)
{
    uintptr_t iov_pack_threshold = YAKSURI_CUDAI_INFO__DEFAULT_IOV_PUP_THRESHOLD;
    if (info) {
        yaksuri_cudai_info_s *cuda_info = (yaksuri_cudai_info_s *) info->backend.cuda.priv;
        iov_pack_threshold = cuda_info->iov_pack_threshold;
    }

    return iov_pack_threshold;
}

uintptr_t yaksuri_cudai_get_iov_unpack_threshold(yaksi_info_s * info)
{
    uintptr_t iov_unpack_threshold = YAKSURI_CUDAI_INFO__DEFAULT_IOV_PUP_THRESHOLD;
    if (info) {
        yaksuri_cudai_info_s *cuda_info = (yaksuri_cudai_info_s *) info->backend.cuda.priv;
        iov_unpack_threshold = cuda_info->iov_unpack_threshold;
    }

    return iov_unpack_threshold;
}

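/*
 * Pack "count" elements of "type" from inbuf into the contiguous
 * outbuf.  All work is enqueued asynchronously on the stream of the
 * target device; one of three strategies is chosen:
 *   1. contiguous type: a single cudaMemcpyAsync;
 *   2. large contiguous segments: one cudaMemcpyAsync per IOV entry;
 *   3. otherwise: the generated pack kernel.
 * A caller would typically synchronize on the same stream afterwards,
 * e.g. (hypothetical sketch, not part of this file):
 *
 *   yaksuri_cudai_ipack(src, dst, count, type, info, dev);
 *   cudaStreamSynchronize(yaksuri_cudai_global.stream[dev]);
 */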
int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type,
                        yaksi_info_s * info, int target)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;
    cudaError_t cerr;

    uintptr_t iov_pack_threshold = yaksuri_cudai_get_iov_pack_threshold(info);

    /* shortcut for contiguous types */
    if (type->is_contig) {
        /* cuda performance is optimized when we synchronize on the
         * source buffer's GPU */
        cerr =
            cudaMemcpyAsync(outbuf, (const char *) inbuf + type->true_lb, count * type->size,
                            cudaMemcpyDefault, yaksuri_cudai_global.stream[target]);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    } else if (type->size / type->num_contig >= iov_pack_threshold) {
        /* segments are large enough that one memcpy per IOV entry
         * beats the pack kernel */
        struct iovec iov[MAX_IOV_LENGTH];
        char *dbuf = (char *) outbuf;
        uintptr_t offset = 0;

        while (offset < type->num_contig * count) {
            uintptr_t actual_iov_len;
            rc = yaksi_iov(inbuf, count, type, offset, iov, MAX_IOV_LENGTH, &actual_iov_len);
            YAKSU_ERR_CHECK(rc, fn_fail);

            for (uintptr_t i = 0; i < actual_iov_len; i++) {
                cerr = cudaMemcpyAsync(dbuf, iov[i].iov_base, iov[i].iov_len, cudaMemcpyDefault,
                                       yaksuri_cudai_global.stream[target]);
                YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
                dbuf += iov[i].iov_len;
            }

            offset += actual_iov_len;
        }
    } else {
        /* launch the generated pack kernel on the target device */
        rc = yaksuri_cudai_md_alloc(type);
        YAKSU_ERR_CHECK(rc, fn_fail);

        unsigned int n_threads;
        unsigned int n_blocks_x, n_blocks_y, n_blocks_z;
        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
        YAKSU_ERR_CHECK(rc, fn_fail);

        int cur_device;
        cerr = cudaGetDevice(&cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cerr = cudaSetDevice(target);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
                        n_blocks_z, target);

        cerr = cudaSetDevice(cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    }

  fn_exit:
    return rc;
  fn_fail:
    goto fn_exit;
}

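/*
 * Unpack "count" elements of "type" from the contiguous inbuf into
 * outbuf; the mirror image of yaksuri_cudai_ipack above, selecting
 * among the same three strategies.
 */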
int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type,
                          yaksi_info_s * info, int target)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;
    cudaError_t cerr;

    uintptr_t iov_unpack_threshold = yaksuri_cudai_get_iov_unpack_threshold(info);

    /* shortcut for contiguous types */
    if (type->is_contig) {
        /* cuda performance is optimized when we synchronize on the
         * source buffer's GPU */
        cerr =
            cudaMemcpyAsync((char *) outbuf + type->true_lb, inbuf, count * type->size,
                            cudaMemcpyDefault, yaksuri_cudai_global.stream[target]);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    } else if (type->size / type->num_contig >= iov_unpack_threshold) {
        /* segments are large enough that one memcpy per IOV entry
         * beats the unpack kernel */
        struct iovec iov[MAX_IOV_LENGTH];
        const char *sbuf = (const char *) inbuf;
        uintptr_t offset = 0;

        while (offset < type->num_contig * count) {
            uintptr_t actual_iov_len;
            rc = yaksi_iov(outbuf, count, type, offset, iov, MAX_IOV_LENGTH, &actual_iov_len);
            YAKSU_ERR_CHECK(rc, fn_fail);

            for (uintptr_t i = 0; i < actual_iov_len; i++) {
                cerr = cudaMemcpyAsync(iov[i].iov_base, sbuf, iov[i].iov_len, cudaMemcpyDefault,
                                       yaksuri_cudai_global.stream[target]);
                YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
                sbuf += iov[i].iov_len;
            }

            offset += actual_iov_len;
        }
    } else {
        /* launch the generated unpack kernel on the target device */
        rc = yaksuri_cudai_md_alloc(type);
        YAKSU_ERR_CHECK(rc, fn_fail);

        unsigned int n_threads;
        unsigned int n_blocks_x, n_blocks_y, n_blocks_z;
        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
        YAKSU_ERR_CHECK(rc, fn_fail);

        int cur_device;
        cerr = cudaGetDevice(&cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cerr = cudaSetDevice(target);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
                          n_blocks_y, n_blocks_z, target);

        cerr = cudaSetDevice(cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    }

  fn_exit:
    return rc;
  fn_fail:
    goto fn_exit;
}