1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for the resized:resized:double datatype: one thread copies one
 * double from a strided source (elements spaced md->extent bytes apart) into
 * a contiguous output buffer.
 *
 * Thread mapping is 1-D: only the .x components of the launch configuration
 * are consulted, so blocks differing only in blockIdx.y/z redo identical
 * work (benign here: duplicates write the same bytes to the same location).
 * Runs on the stream chosen by the host-side launcher. */
__global__ void yaksuri_cudai_kernel_pack_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is evaluated in
     * 32-bit unsigned arithmetic and silently wraps once the grid
     * addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    /* contiguous destination, extent-strided source */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent));
}
33
/* Host-side launcher: enqueues the resized:resized:double pack kernel on the
 * per-device stream kept in yaksuri_cudai_global; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
41
/* Unpack kernel for the resized:resized:double datatype: one thread copies
 * one double from a contiguous source buffer into a strided destination
 * (elements spaced md->extent bytes apart).
 *
 * Thread mapping is 1-D: only the .x components of the launch configuration
 * are consulted, so blocks differing only in blockIdx.y/z redo identical
 * work (benign: duplicates write the same bytes to the same location). */
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is evaluated in
     * 32-bit unsigned arithmetic and silently wraps once the grid
     * addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    /* extent-strided destination, contiguous source */
    *((double *) (void *) (dbuf + x0 * extent)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
58
/* Host-side launcher: enqueues the resized:resized:double unpack kernel on
 * the per-device stream kept in yaksuri_cudai_global; launch errors are
 * reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
66
/* Pack kernel for the hvector:resized:resized:double datatype: one thread
 * copies one double. The flat thread index is decomposed into
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — hvector block index (spaced md->u.hvector.stride bytes),
 *   x2 — element within the block (spaced by the child extent),
 * and the gathered element is written contiguously at position idx.
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* peel off the hvector block dimension */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2));
}
92
/* Host-side launcher: enqueues the hvector:resized:resized:double pack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hvector_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
100
/* Unpack kernel for the hvector:resized:resized:double datatype: one thread
 * copies one double from the contiguous source at position idx into the
 * strided destination addressed by the (x0, x1, x2) decomposition:
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — hvector block index (spaced md->u.hvector.stride bytes),
 *   x2 — element within the block (spaced by the child extent).
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* peel off the hvector block dimension */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((double *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
126
/* Host-side launcher: enqueues the hvector:resized:resized:double unpack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hvector_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
134
/* Pack kernel for the blkhindx:resized:resized:double datatype: one thread
 * copies one double. The flat thread index is decomposed into
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — block index, whose byte displacement comes from
 *        md->u.blkhindx.array_of_displs,
 *   x2 — element within the block (spaced by the child extent),
 * and the gathered element is written contiguously at position idx.
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* peel off the indexed-block dimension */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}
160
/* Host-side launcher: enqueues the blkhindx:resized:resized:double pack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_blkhindx_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
168
/* Unpack kernel for the blkhindx:resized:resized:double datatype: one thread
 * copies one double from the contiguous source at position idx into the
 * destination addressed by the (x0, x1, x2) decomposition:
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — block index, whose byte displacement comes from
 *        md->u.blkhindx.array_of_displs,
 *   x2 — element within the block (spaced by the child extent).
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* peel off the indexed-block dimension */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
194
/* Host-side launcher: enqueues the blkhindx:resized:resized:double unpack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_blkhindx_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
202
/* Pack kernel for the hindexed:resized:resized:double datatype: one thread
 * copies one double. Because hindexed blocks have variable lengths, the
 * block index x1 is found by a linear scan over the per-block element
 * counts rather than by division:
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — hindexed block index (byte displacement from array_of_displs),
 *   x2 — element within the block (spaced by the child extent).
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* the scan below always terminates for in-range idx (res is smaller
     * than the sum of all block element counts); initialize x1 anyway so
     * a malformed metadata tree cannot index displs with garbage */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}
239
/* Host-side launcher: enqueues the hindexed:resized:resized:double pack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hindexed_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
247
/* Unpack kernel for the hindexed:resized:resized:double datatype: one thread
 * copies one double from the contiguous source at position idx into the
 * destination addressed by the (x0, x1, x2) decomposition. Because hindexed
 * blocks have variable lengths, the block index x1 is found by a linear
 * scan over the per-block element counts rather than by division:
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — hindexed block index (byte displacement from array_of_displs),
 *   x2 — element within the block (spaced by the child extent).
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* the scan below always terminates for in-range idx (res is smaller
     * than the sum of all block element counts); initialize x1 anyway so
     * a malformed metadata tree cannot index displs with garbage */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
284
/* Host-side launcher: enqueues the hindexed:resized:resized:double unpack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hindexed_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
292
/* Pack kernel for the contig:resized:resized:double datatype: one thread
 * copies one double. The flat thread index is decomposed into
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — element within the contig run (spaced by the child extent),
 * and the gathered element is written contiguously at position idx.
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_pack_contig_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + x1 * stride1));
}
314
/* Host-side launcher: enqueues the contig:resized:resized:double pack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
322
/* Unpack kernel for the contig:resized:resized:double datatype: one thread
 * copies one double from the contiguous source at position idx into the
 * destination addressed by the (x0, x1) decomposition:
 *   x0 — outer datatype index (spaced md->extent bytes),
 *   x1 — element within the contig run (spaced by the child extent).
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outer count dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    *((double *) (void *) (dbuf + x0 * extent + x1 * stride1)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
344
/* Host-side launcher: enqueues the contig:resized:resized:double unpack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
352
/* Pack kernel for the resized:resized:resized:double datatype: one thread
 * copies one double from a strided source (elements spaced md->extent bytes
 * apart) into a contiguous output buffer. The nested resize wrappers only
 * change the extent, so the body is identical to the two-level variant.
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_pack_resized_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    /* contiguous destination, extent-strided source */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent));
}
369
/* Host-side launcher: enqueues the resized:resized:resized:double pack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
377
/* Unpack kernel for the resized:resized:resized:double datatype: one thread
 * copies one double from a contiguous source buffer into a strided
 * destination (elements spaced md->extent bytes apart). The nested resize
 * wrappers only change the extent, so the body matches the two-level variant.
 *
 * Thread mapping is 1-D: only the .x launch components are consulted;
 * duplicate blocks in y/z redo identical (benign) work. */
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and
     * silently wraps once the grid addresses more than 2^32 elements */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res;

    /* extent-strided destination, contiguous source */
    *((double *) (void *) (dbuf + x0 * extent)) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
394
/* Host-side launcher: enqueues the resized:resized:resized:double unpack
 * kernel on the per-device stream kept in yaksuri_cudai_global; launch
 * errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_resized_resized_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_resized_double,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
402
403