/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"
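
/* All kernels below use the same scheme: one thread per float element.
 * The flat thread index idx is turned into one coordinate per level of
 * the nested datatype by repeated division by the number of elements
 * contained below that level (inner_elements); the strided buffer
 * offset is then the sum over levels of coordinate * extent, stride,
 * or displacement, using the num_elements, extent, and per-kind
 * u.contig / u.hvector / u.blkhindx / u.hindexed / u.resized members
 * of yaksuri_cudai_md_s. */

/* contig of contig of float: pack gathers from the strided source into
 * a dense buffer; the matching unpack kernel below scatters back. */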
__global__ void yaksuri_cudai_kernel_pack_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2));
}

void yaksuri_cudai_pack_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
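
/* A minimal launch sketch (hypothetical caller: src, dst, and dev are
 * illustrative names, and the grid geometry is only an example; in
 * yaksa these launchers are driven by the yaksuri CUDA backend):
 *
 *     uintptr_t n = count * md->num_elements;    // one thread per float
 *     int n_threads = 256;
 *     int n_blocks  = (int) ((n + n_threads - 1) / n_threads);
 *     yaksuri_cudai_pack_contig_contig_float(src, dst, count, md,
 *                                            n_threads, n_blocks, 1, 1, dev);
 */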

__global__ void yaksuri_cudai_kernel_unpack_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
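
/* hvector of contig of contig of float: x1 indexes the hvector block
 * (scaled by the hvector stride), x2 the element within a block (scaled
 * by the child extent); the inner contig levels add x3 and x4. */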
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * stride3));
}

void yaksuri_cudai_pack_hvector_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hvector_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
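
/* blkhindx of contig of contig of float: like hvector, but block x1 is
 * placed via array_of_displs[x1] instead of a uniform stride. */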
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3));
}

void yaksuri_cudai_pack_blkhindx_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_blkhindx_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
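
/* hindexed of contig of contig of float: blocks have per-entry lengths,
 * so block index x1 is found by a linear scan over
 * array_of_blocklengths rather than by division. */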
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3));
}

void yaksuri_cudai_pack_hindexed_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hindexed_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
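
/* contig of contig of contig of float: three nested contig levels; each
 * coordinate is scaled by the extent of the child at that level. */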
__global__ void yaksuri_cudai_kernel_pack_contig_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * stride3));
}

void yaksuri_cudai_pack_contig_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t stride3 = md->u.contig.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
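
/* resized of contig of contig of float: the resized level only changes
 * the top-level extent used for x0; it adds no coordinate of its own. */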
__global__ void yaksuri_cudai_kernel_pack_resized_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t stride3 = md->u.resized.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride2 + x2 * stride3));
}

void yaksuri_cudai_pack_resized_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.contig.count;

    uintptr_t x2 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t stride3 = md->u.resized.child->u.contig.child->u.contig.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride2 + x2 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_resized_contig_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_contig_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}