/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include <assert.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include "yaksi.h"
#include "yaksuri_cudai.h"
#include <stdlib.h>

#define THREAD_BLOCK_SIZE  (256)
#define MAX_GRIDSZ_X       ((1U << 31) - 1)
#define MAX_GRIDSZ_Y       (65535U)
#define MAX_GRIDSZ_Z       (65535U)

#define MAX_IOV_LENGTH (8192)

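/* Map a (count, type) pair onto a CUDA kernel launch configuration:
 * one thread per basic element, THREAD_BLOCK_SIZE threads per block,
 * with the blocks spread over the x/y/z grid dimensions as needed. */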
static int get_thread_block_dims(uintptr_t count, yaksi_type_s * type, unsigned int *n_threads,
                                 unsigned int *n_blocks_x, unsigned int *n_blocks_y,
                                 unsigned int *n_blocks_z)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;

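    /* total blocks needed = ceil(total elements / THREAD_BLOCK_SIZE) */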
    *n_threads = THREAD_BLOCK_SIZE;
    uintptr_t n_blocks = count * cuda_type->num_elements / THREAD_BLOCK_SIZE;
    n_blocks += !!(count * cuda_type->num_elements % THREAD_BLOCK_SIZE);

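    /* fit the blocks into the x dimension if possible; otherwise spill
     * over into the y and z grid dimensions, respecting the CUDA grid limits */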
    if (n_blocks <= MAX_GRIDSZ_X) {
        *n_blocks_x = (unsigned int) n_blocks;
        *n_blocks_y = 1;
        *n_blocks_z = 1;
    } else if (n_blocks <= (uintptr_t) MAX_GRIDSZ_X * MAX_GRIDSZ_Y) {
        *n_blocks_x = (unsigned int) (YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Y));
        *n_blocks_y = (unsigned int) (YAKSU_CEIL(n_blocks, (*n_blocks_x)));
        *n_blocks_z = 1;
    } else {
        uintptr_t n_blocks_xy = YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Z);
        *n_blocks_x = (unsigned int) (YAKSU_CEIL(n_blocks_xy, MAX_GRIDSZ_Y));
        *n_blocks_y = (unsigned int) (YAKSU_CEIL(n_blocks_xy, (*n_blocks_x)));
        *n_blocks_z =
            (unsigned int) (YAKSU_CEIL(n_blocks, (uintptr_t) (*n_blocks_x) * (*n_blocks_y)));
    }

    return rc;
}

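/* a type can be packed/unpacked on the GPU if it is contiguous or if a
 * pack kernel was generated for it */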
int yaksuri_cudai_pup_is_supported(yaksi_type_s * type, bool * is_supported)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;

    if (type->is_contig || cuda_type->pack)
        *is_supported = true;
    else
        *is_supported = false;

    return rc;
}

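/* minimum average contiguous segment size (in bytes) above which the
 * IOV-based cudaMemcpyAsync path is used instead of the pack/unpack
 * kernels; the default can be overridden through the info object */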
uintptr_t yaksuri_cudai_get_iov_pack_threshold(yaksi_info_s * info)
{
    uintptr_t iov_pack_threshold = YAKSURI_CUDAI_INFO__DEFAULT_IOV_PUP_THRESHOLD;
    if (info) {
        yaksuri_cudai_info_s *cuda_info = (yaksuri_cudai_info_s *) info->backend.cuda.priv;
        iov_pack_threshold = cuda_info->iov_pack_threshold;
    }

    return iov_pack_threshold;
}

uintptr_t yaksuri_cudai_get_iov_unpack_threshold(yaksi_info_s * info)
{
    uintptr_t iov_unpack_threshold = YAKSURI_CUDAI_INFO__DEFAULT_IOV_PUP_THRESHOLD;
    if (info) {
        yaksuri_cudai_info_s *cuda_info = (yaksuri_cudai_info_s *) info->backend.cuda.priv;
        iov_unpack_threshold = cuda_info->iov_unpack_threshold;
    }

    return iov_unpack_threshold;
}

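/* pack 'count' elements of 'type' from inbuf into the contiguous outbuf,
 * issuing the work asynchronously on the target device */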
int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type,
                        yaksi_info_s * info, int target)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;
    cudaError_t cerr;

    uintptr_t iov_pack_threshold = yaksuri_cudai_get_iov_pack_threshold(info);

    /* shortcut for contiguous types */
    if (type->is_contig) {
        /* cuda performance is optimized when we synchronize on the
         * source buffer's GPU */
        cerr =
            cudaMemcpyAsync(outbuf, (const char *) inbuf + type->true_lb, count * type->size,
                            cudaMemcpyDefault, yaksuri_cudai_global.stream[target]);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    } else if (type->size / type->num_contig >= iov_pack_threshold) {
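        /* the average contiguous segment is large enough: copy each IOV
         * segment with its own cudaMemcpyAsync instead of launching the
         * pack kernel */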
        struct iovec iov[MAX_IOV_LENGTH];
        char *dbuf = (char *) outbuf;
        uintptr_t offset = 0;

        while (offset < type->num_contig * count) {
            uintptr_t actual_iov_len;
            rc = yaksi_iov(inbuf, count, type, offset, iov, MAX_IOV_LENGTH, &actual_iov_len);
            YAKSU_ERR_CHECK(rc, fn_fail);

            for (uintptr_t i = 0; i < actual_iov_len; i++) {
                cerr = cudaMemcpyAsync(dbuf, iov[i].iov_base, iov[i].iov_len, cudaMemcpyDefault,
                                       yaksuri_cudai_global.stream[target]);
                YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
                dbuf += iov[i].iov_len;
            }

            offset += actual_iov_len;
        }
    } else {
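        /* non-contiguous type with small segments: launch the generated
         * pack kernel using the type's device metadata */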
        rc = yaksuri_cudai_md_alloc(type);
        YAKSU_ERR_CHECK(rc, fn_fail);

        unsigned int n_threads;
        unsigned int n_blocks_x, n_blocks_y, n_blocks_z;
        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
        YAKSU_ERR_CHECK(rc, fn_fail);

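        /* the kernel must be launched on the target device; switch to it
         * and restore the caller's device afterwards */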
        int cur_device;
        cerr = cudaGetDevice(&cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cerr = cudaSetDevice(target);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
                        n_blocks_z, target);

        cerr = cudaSetDevice(cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    }

  fn_exit:
    return rc;
  fn_fail:
    goto fn_exit;
}

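/* unpack 'count' elements of 'type' from the contiguous inbuf into outbuf,
 * issuing the work asynchronously on the target device */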
int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_type_s * type,
                          yaksi_info_s * info, int target)
{
    int rc = YAKSA_SUCCESS;
    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;
    cudaError_t cerr;

    uintptr_t iov_unpack_threshold = yaksuri_cudai_get_iov_unpack_threshold(info);

    /* shortcut for contiguous types */
    if (type->is_contig) {
        /* cuda performance is optimized when we synchronize on the
         * source buffer's GPU */
        cerr =
            cudaMemcpyAsync((char *) outbuf + type->true_lb, inbuf, count * type->size,
                            cudaMemcpyDefault, yaksuri_cudai_global.stream[target]);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    } else if (type->size / type->num_contig >= iov_unpack_threshold) {
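        /* the average contiguous segment is large enough: copy into each
         * IOV segment with its own cudaMemcpyAsync instead of launching
         * the unpack kernel */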
        struct iovec iov[MAX_IOV_LENGTH];
        const char *sbuf = (const char *) inbuf;
        uintptr_t offset = 0;

        while (offset < type->num_contig * count) {
            uintptr_t actual_iov_len;
            rc = yaksi_iov(outbuf, count, type, offset, iov, MAX_IOV_LENGTH, &actual_iov_len);
            YAKSU_ERR_CHECK(rc, fn_fail);

            for (uintptr_t i = 0; i < actual_iov_len; i++) {
                cerr = cudaMemcpyAsync(iov[i].iov_base, sbuf, iov[i].iov_len, cudaMemcpyDefault,
                                       yaksuri_cudai_global.stream[target]);
                YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
                sbuf += iov[i].iov_len;
            }

            offset += actual_iov_len;
        }
    } else {
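        /* non-contiguous type with small segments: launch the generated
         * unpack kernel using the type's device metadata */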
        rc = yaksuri_cudai_md_alloc(type);
        YAKSU_ERR_CHECK(rc, fn_fail);

        unsigned int n_threads;
        unsigned int n_blocks_x, n_blocks_y, n_blocks_z;
        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
        YAKSU_ERR_CHECK(rc, fn_fail);

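        /* the kernel must be launched on the target device; switch to it
         * and restore the caller's device afterwards */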
        int cur_device;
        cerr = cudaGetDevice(&cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cerr = cudaSetDevice(target);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
                          n_blocks_y, n_blocks_z, target);

        cerr = cudaSetDevice(cur_device);
        YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
    }

  fn_exit:
    return rc;
  fn_fail:
    goto fn_exit;
}