1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from a blkhindx[blkhindx] layout in
     * inbuf into a contiguous outbuf.  Grid/block layout: 1-D, one thread
     * per packed element; extra threads exit at the guard. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard: the grid rarely divides the element count evenly. */
    if (gid >= count * elems)
        return;

    /* Peel one coordinate per nesting level by dividing out the number of
     * elements remaining in the inner levels. */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem;

    const intptr_t *displs1 = md->u.blkhindx.array_of_displs;
    const intptr_t *displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.blkhindx.child->extent;

    /* Source address: outer extent, outer displacement, inner extent,
     * inner displacement, then the contiguous element offset. */
    const char *from = src + i0 * outer_extent + displs1[i1] + i2 * child_extent + displs2[i3] + i4 * sizeof(int32_t);
    *((int32_t *) (void *) (dst + gid * sizeof(int32_t))) = *((const int32_t *) (const void *) from);
}
50 
void yaksuri_cudai_pack_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream with
     * the caller-supplied grid/block shape and check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
58 
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into a
     * blkhindx[blkhindx] layout in outbuf (mirror of the pack kernel). */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level. */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem;

    const intptr_t *displs1 = md->u.blkhindx.array_of_displs;
    const intptr_t *displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.blkhindx.child->extent;

    /* Destination address mirrors the pack kernel's source address. */
    char *to = dst + i0 * outer_extent + displs1[i1] + i2 * child_extent + displs2[i3] + i4 * sizeof(int32_t);
    *((int32_t *) (void *) to) = *((const int32_t *) (const void *) (src + gid * sizeof(int32_t)));
}
92 
void yaksuri_cudai_unpack_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
100 
__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from an hvector[blkhindx[blkhindx]]
     * layout in inbuf into a contiguous outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (seven levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i5 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i6 = rem;

    const intptr_t hv_stride = md->u.hvector.stride;
    const intptr_t *displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.hvector.child->extent;
    const intptr_t *displs3 = md->u.hvector.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.hvector.child->u.blkhindx.child->extent;

    const char *from = src + i0 * outer_extent + i1 * hv_stride + i2 * child_extent + displs2[i3] + i4 * grandchild_extent + displs3[i5] + i6 * sizeof(int32_t);
    *((int32_t *) (void *) (dst + gid * sizeof(int32_t))) = *((const int32_t *) (const void *) from);
}
143 
void yaksuri_cudai_pack_hvector_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
151 
__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into an
     * hvector[blkhindx[blkhindx]] layout in outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (seven levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i5 = rem / elems;
    rem %= elems;
    elems /= md->u.hvector.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i6 = rem;

    const intptr_t hv_stride = md->u.hvector.stride;
    const intptr_t *displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.hvector.child->extent;
    const intptr_t *displs3 = md->u.hvector.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.hvector.child->u.blkhindx.child->extent;

    char *to = dst + i0 * outer_extent + i1 * hv_stride + i2 * child_extent + displs2[i3] + i4 * grandchild_extent + displs3[i5] + i6 * sizeof(int32_t);
    *((int32_t *) (void *) to) = *((const int32_t *) (const void *) (src + gid * sizeof(int32_t)));
}
194 
void yaksuri_cudai_unpack_hvector_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
202 
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from a blkhindx[blkhindx[blkhindx]]
     * layout in inbuf into a contiguous outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (seven levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i5 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i6 = rem;

    const intptr_t *displs1 = md->u.blkhindx.array_of_displs;
    const intptr_t *displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.blkhindx.child->extent;
    const intptr_t *displs3 = md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.blkhindx.child->u.blkhindx.child->extent;

    const char *from = src + i0 * outer_extent + displs1[i1] + i2 * child_extent + displs2[i3] + i4 * grandchild_extent + displs3[i5] + i6 * sizeof(int32_t);
    *((int32_t *) (void *) (dst + gid * sizeof(int32_t))) = *((const int32_t *) (const void *) from);
}
245 
void yaksuri_cudai_pack_blkhindx_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
253 
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into a
     * blkhindx[blkhindx[blkhindx]] layout in outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (seven levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i5 = rem / elems;
    rem %= elems;
    elems /= md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i6 = rem;

    const intptr_t *displs1 = md->u.blkhindx.array_of_displs;
    const intptr_t *displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t child_extent = md->u.blkhindx.child->extent;
    const intptr_t *displs3 = md->u.blkhindx.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.blkhindx.child->u.blkhindx.child->extent;

    char *to = dst + i0 * outer_extent + displs1[i1] + i2 * child_extent + displs2[i3] + i4 * grandchild_extent + displs3[i5] + i6 * sizeof(int32_t);
    *((int32_t *) (void *) to) = *((const int32_t *) (const void *) (src + gid * sizeof(int32_t)));
}
296 
void yaksuri_cudai_unpack_blkhindx_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
304 
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from a hindexed[blkhindx[blkhindx]]
     * layout in inbuf into a contiguous outbuf.  Grid/block layout: 1-D,
     * one thread per packed element; extra threads exit at the guard. */
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Outermost (count) coordinate. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have per-block lengths, so the block index must be
     * found by walking the blocklength array rather than by division.
     * x1 is initialized to 0 so the value is well-defined even if the
     * metadata were inconsistent and the loop never matched (previously
     * x1 was read uninitialized in that case — undefined behavior). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* Remaining coordinates: divide out the inner element counts level
     * by level. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    /* Assemble the source byte offset from extents and displacements. */
    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((int32_t *) (void *) (dbuf + idx * sizeof(int32_t))) = *((const int32_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int32_t)));
}
358 
void yaksuri_cudai_pack_hindexed_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
366 
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into a
     * hindexed[blkhindx[blkhindx]] layout in outbuf (mirror of the pack
     * kernel).  One thread per element; extra threads exit at the guard. */
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Outermost (count) coordinate. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have per-block lengths, so the block index must be
     * found by walking the blocklength array rather than by division.
     * x1 is initialized to 0 so the value is well-defined even if the
     * metadata were inconsistent and the loop never matched (previously
     * x1 was read uninitialized in that case — undefined behavior). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* Remaining coordinates: divide out the inner element counts level
     * by level. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    /* Assemble the destination byte offset from extents and displacements. */
    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((int32_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int32_t))) = *((const int32_t *) (const void *) (sbuf + idx * sizeof(int32_t)));
}
420 
void yaksuri_cudai_unpack_hindexed_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
428 
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from a contig[blkhindx[blkhindx]]
     * layout in inbuf into a contiguous outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (six levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.count;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.blocklength;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i5 = rem;

    /* For contig, consecutive children are laid out at the child extent. */
    const intptr_t child_stride = md->u.contig.child->extent;
    const intptr_t *displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    const intptr_t *displs3 = md->u.contig.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.contig.child->u.blkhindx.child->extent;

    const char *from = src + i0 * outer_extent + i1 * child_stride + displs2[i2] + i3 * grandchild_extent + displs3[i4] + i5 * sizeof(int32_t);
    *((int32_t *) (void *) (dst + gid * sizeof(int32_t))) = *((const int32_t *) (const void *) from);
}
467 
void yaksuri_cudai_pack_contig_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
475 
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into a
     * contig[blkhindx[blkhindx]] layout in outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level (six levels deep). */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.count;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.blocklength;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i4 = rem / elems;
    rem %= elems;
    elems /= md->u.contig.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i5 = rem;

    /* For contig, consecutive children are laid out at the child extent. */
    const intptr_t child_stride = md->u.contig.child->extent;
    const intptr_t *displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    const intptr_t *displs3 = md->u.contig.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.contig.child->u.blkhindx.child->extent;

    char *to = dst + i0 * outer_extent + i1 * child_stride + displs2[i2] + i3 * grandchild_extent + displs3[i4] + i5 * sizeof(int32_t);
    *((int32_t *) (void *) to) = *((const int32_t *) (const void *) (src + gid * sizeof(int32_t)));
}
514 
void yaksuri_cudai_unpack_contig_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
522 
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack one int32_t per thread from a resized[blkhindx[blkhindx]]
     * layout in inbuf into a contiguous outbuf.  The resized level only
     * changes the outer extent; it adds no coordinate of its own. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level. */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem;

    const intptr_t *displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    const intptr_t *displs3 = md->u.resized.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.resized.child->u.blkhindx.child->extent;

    const char *from = src + i0 * outer_extent + displs2[i1] + i2 * grandchild_extent + displs3[i3] + i4 * sizeof(int32_t);
    *((int32_t *) (void *) (dst + gid * sizeof(int32_t))) = *((const int32_t *) (const void *) from);
}
556 
void yaksuri_cudai_pack_resized_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
564 
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack one int32_t per thread from a contiguous inbuf into a
     * resized[blkhindx[blkhindx]] layout in outbuf.  The resized level
     * only changes the outer extent; it adds no coordinate of its own. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;
    const uintptr_t outer_extent = md->extent;
    const uintptr_t gid = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems = md->num_elements;

    /* Bounds guard for the grid tail. */
    if (gid >= count * elems)
        return;

    /* Decode one coordinate per nesting level. */
    uintptr_t rem = gid;
    const uintptr_t i0 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.count;

    const uintptr_t i1 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.blocklength;

    const uintptr_t i2 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.blkhindx.count;

    const uintptr_t i3 = rem / elems;
    rem %= elems;
    elems /= md->u.resized.child->u.blkhindx.child->u.blkhindx.blocklength;

    const uintptr_t i4 = rem;

    const intptr_t *displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    const intptr_t *displs3 = md->u.resized.child->u.blkhindx.child->u.blkhindx.array_of_displs;
    const uintptr_t grandchild_extent = md->u.resized.child->u.blkhindx.child->extent;

    char *to = dst + i0 * outer_extent + displs2[i1] + i2 * grandchild_extent + displs3[i3] + i4 * sizeof(int32_t);
    *((int32_t *) (void *) to) = *((const int32_t *) (const void *) (src + gid * sizeof(int32_t)));
}
598 
void yaksuri_cudai_unpack_resized_blkhindx_blkhindx_int32_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel on the per-device stream and
     * check the launch status. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_blkhindx_int32_t,
                                        grid, block, kern_args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
606 
607