1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack kernel: hindexed type whose child is a blkhindx type over double
 * leaf elements.  Each thread copies exactly one double from the strided
 * source layout described by 'md' into the contiguous buffer 'outbuf'.
 * Expects a 1-D launch whose total thread count covers at least
 * count * md->num_elements; surplus threads exit at the bounds guard. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard: the grid is normally padded past the element count */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have variable lengths, so locate the block that
     * contains the residual offset by linear scan */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): x1 is left unset if the scan falls through; this relies
     * on md->num_elements equaling the sum of the block element counts so
     * that res always lands inside some block — generated-code invariant */

    /* x2: element within the hindexed block's child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    /* x3: blkhindx block index; x4: double within that block */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* gather: source address built from the decoded coordinates,
     * destination is contiguous at idx */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(double)));
}
61 
/* Host-side launcher: enqueues the hindexed/blkhindx double pack kernel on
 * the per-device yaksa stream.  Grid/block geometry is chosen by the caller. */
void yaksuri_cudai_pack_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel receives the addresses of the kernel arguments */
    void *kargs[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);

    cudaError_t rc = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_double,
                                      grid, block, kargs, 0 /* no dynamic shared memory */,
                                      yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(rc);
}
69 
/* Unpack kernel: inverse of the matching pack kernel.  Each thread copies
 * one double from the contiguous buffer 'inbuf' back into the strided
 * hindexed/blkhindx destination layout described by 'md'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x1
     * is assigned — invariant of the generating type engine */

    /* x2: element within the hindexed block's child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    /* x3: blkhindx block index; x4: double within that block */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* scatter: destination address built from the decoded coordinates,
     * source is contiguous at idx */
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
114 
/* Host-side launcher: enqueues the hindexed/blkhindx double unpack kernel
 * on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_unpack_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* argument pointers, as required by cudaLaunchKernel */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, 0 /* no dynamic smem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
122 
/* Pack kernel: hvector -> hindexed -> blkhindx type chain over double leaf
 * elements.  Each thread copies one double from the strided source layout
 * described by 'md' into the contiguous buffer 'outbuf'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block index; x2: element within that block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x3
     * is assigned — invariant of the generating type engine */

    /* x4: element within the hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.count;

    /* x5: blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* gather: source address from decoded coordinates; destination is
     * contiguous at idx */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double)));
}
176 
/* Host-side launcher: enqueues the hvector/hindexed/blkhindx double pack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_pack_hvector_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* kernel arguments are passed by address */
    void *kargs[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);

    cudaError_t rc = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_blkhindx_double,
                                      grid, block, kargs, 0 /* no dynamic shared memory */,
                                      yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(rc);
}
184 
/* Unpack kernel: inverse of the matching pack kernel.  Each thread copies
 * one double from the contiguous buffer 'inbuf' back into the strided
 * hvector/hindexed/blkhindx destination layout described by 'md'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block index; x2: element within that block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x3
     * is assigned — invariant of the generating type engine */

    /* x4: element within the hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.count;

    /* x5: blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* scatter: destination address from decoded coordinates; source is
     * contiguous at idx */
    *((double *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
238 
/* Host-side launcher: enqueues the hvector/hindexed/blkhindx double unpack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_unpack_hvector_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* argument pointers, as required by cudaLaunchKernel */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, 0 /* no dynamic smem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
246 
/* Pack kernel: blkhindx -> hindexed -> blkhindx type chain over double leaf
 * elements.  Each thread copies one double from the strided source layout
 * described by 'md' into the contiguous buffer 'outbuf'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: top-level blkhindx block index; x2: element within that block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x3
     * is assigned — invariant of the generating type engine */

    /* x4: element within the hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.count;

    /* x5: inner blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* gather: source address from decoded coordinates; destination is
     * contiguous at idx */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double)));
}
300 
/* Host-side launcher: enqueues the blkhindx/hindexed/blkhindx double pack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_pack_blkhindx_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* kernel arguments are passed by address */
    void *kargs[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);

    cudaError_t rc = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_blkhindx_double,
                                      grid, block, kargs, 0 /* no dynamic shared memory */,
                                      yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(rc);
}
308 
/* Unpack kernel: inverse of the matching pack kernel.  Each thread copies
 * one double from the contiguous buffer 'inbuf' back into the strided
 * blkhindx/hindexed/blkhindx destination layout described by 'md'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: top-level blkhindx block index; x2: element within that block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x3
     * is assigned — invariant of the generating type engine */

    /* x4: element within the hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.count;

    /* x5: inner blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* scatter: destination address from decoded coordinates; source is
     * contiguous at idx */
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
362 
/* Host-side launcher: enqueues the blkhindx/hindexed/blkhindx double unpack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_unpack_blkhindx_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* argument pointers, as required by cudaLaunchKernel */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, 0 /* no dynamic smem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
370 
/* Pack kernel: hindexed -> hindexed -> blkhindx type chain over double leaf
 * elements.  Each thread copies one double from the strided source layout
 * described by 'md' into the contiguous buffer 'outbuf'.  Two linear scans
 * are needed because both hindexed levels have variable-length blocks.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* outer hindexed level: scan for the containing block */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): x1/x3 rely on res always landing inside some block —
     * invariant of the generating type engine */

    /* x2: element within the outer hindexed block's child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* inner hindexed level: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: element within the inner hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.count;

    /* x5: blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* gather: source address from decoded coordinates; destination is
     * contiguous at idx */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double)));
}
435 
/* Host-side launcher: enqueues the hindexed/hindexed/blkhindx double pack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_pack_hindexed_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* kernel arguments are passed by address */
    void *kargs[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);

    cudaError_t rc = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_blkhindx_double,
                                      grid, block, kargs, 0 /* no dynamic shared memory */,
                                      yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(rc);
}
443 
/* Unpack kernel: inverse of the matching pack kernel.  Each thread copies
 * one double from the contiguous buffer 'inbuf' back into the strided
 * hindexed/hindexed/blkhindx destination layout described by 'md'.  Two
 * linear scans are needed because both hindexed levels have variable-length
 * blocks.  Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* outer hindexed level: scan for the containing block */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): x1/x3 rely on res always landing inside some block —
     * invariant of the generating type engine */

    /* x2: element within the outer hindexed block's child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* inner hindexed level: scan for the containing block */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: element within the inner hindexed block's child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.count;

    /* x5: blkhindx block index; x6: double within that block */
    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* scatter: destination address from decoded coordinates; source is
     * contiguous at idx */
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
508 
/* Host-side launcher: enqueues the hindexed/hindexed/blkhindx double unpack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_unpack_hindexed_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* argument pointers, as required by cudaLaunchKernel */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, 0 /* no dynamic smem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
516 
/* Pack kernel: contig -> hindexed -> blkhindx type chain over double leaf
 * elements.  Each thread copies one double from the strided source layout
 * described by 'md' into the contiguous buffer 'outbuf'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: contig iteration (child extent acts as the stride) */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x2
     * is assigned — invariant of the generating type engine */

    /* x3: element within the hindexed block's child */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.count;

    /* x4: blkhindx block index; x5: double within that block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* gather: source address from decoded coordinates; destination is
     * contiguous at idx */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(double)));
}
566 
/* Host-side launcher: enqueues the contig/hindexed/blkhindx double pack
 * kernel on the per-device yaksa stream with caller-supplied geometry. */
void yaksuri_cudai_pack_contig_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* kernel arguments are passed by address */
    void *kargs[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);

    cudaError_t rc = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_blkhindx_double,
                                      grid, block, kargs, 0 /* no dynamic shared memory */,
                                      yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(rc);
}
574 
/* Unpack kernel: inverse of the matching pack kernel.  Each thread copies
 * one double from the contiguous buffer 'inbuf' back into the strided
 * contig/hindexed/blkhindx destination layout described by 'md'.
 * Expects a 1-D launch covering count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat global element index: one thread <-> one double */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    /* residual offset, consumed level by level during the decode below */
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* bounds guard for the padded grid */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which repetition of the whole type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: contig iteration (child extent acts as the stride) */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* variable-length hindexed blocks: scan for the containing block */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }
    /* NOTE(review): relies on res always landing inside a block so that x2
     * is assigned — invariant of the generating type engine */

    /* x3: element within the hindexed block's child */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.count;

    /* x4: blkhindx block index; x5: double within that block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* scatter: destination address from decoded coordinates; source is
     * contiguous at idx */
    *((double *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
624 
/* Host-side launcher: enqueues the unpack kernel for the
 * contig[hindexed[blkhindx[double]]] datatype on the per-device yaksa
 * stream.  Launch geometry is supplied by the caller; errors from the
 * launch are funneled through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the addresses of the kernel arguments. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, /* sharedMem */ 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
632 
/* Pack kernel for a resized[hindexed[blkhindx[double]]] datatype tree.
 * Each thread moves exactly one double: it gathers one element from its
 * scattered position in the source buffer and writes it to slot "idx" of
 * the packed (contiguous) destination buffer.  Assumes a 1D launch;
 * threads past the last element return immediately. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;   /* byte extent of one outer (resized) element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;             /* residual flat index, peeled one datatype level at a time */
    uintptr_t inner_elements = md->num_elements;   /* doubles per outer element */

    if (idx >= (count * inner_elements))
        return;

    /* Level 0: which of the "count" top-level elements this thread targets.
     * The resized layer only changes the extent, so it contributes no index
     * of its own. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* Level 1: hindexed blocks have variable lengths, so the block index
     * cannot be obtained by division; scan blocks, subtracting each block's
     * element count until res falls inside one.  The division just above is
     * superseded by the assignment inside the loop.
     * NOTE(review): x1 stays uninitialized if res exceeds the total element
     * count; the idx bound check above presumably guarantees a hit. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* Level 2: position within the hindexed block (child type repetitions). */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.count;

    /* Level 3: blkhindx block index; level 4 (x4): double within that block. */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* Gather: sum the byte offsets contributed by each level to locate the
     * source double; destination is simply the idx-th packed slot. */
    *((double *) (void *) (dbuf + idx * sizeof(double))) = *((const double *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(double)));
}
677 
/* Host-side launcher: enqueues the pack kernel for the
 * resized[hindexed[blkhindx[double]]] datatype on the per-device yaksa
 * stream.  Launch geometry is supplied by the caller; errors from the
 * launch are funneled through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the addresses of the kernel arguments. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, /* sharedMem */ 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
685 
/* Unpack kernel for a resized[hindexed[blkhindx[double]]] datatype tree.
 * Each thread moves exactly one double: it reads element "idx" from the
 * packed (contiguous) source buffer and writes it to its scattered
 * position in the destination buffer.  Assumes a 1D launch; threads past
 * the last element return immediately. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;   /* byte extent of one outer (resized) element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;             /* residual flat index, peeled one datatype level at a time */
    uintptr_t inner_elements = md->num_elements;   /* doubles per outer element */

    if (idx >= (count * inner_elements))
        return;

    /* Level 0: which of the "count" top-level elements this thread targets.
     * The resized layer only changes the extent, so it contributes no index
     * of its own. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* Level 1: hindexed blocks have variable lengths, so the block index
     * cannot be obtained by division; scan blocks, subtracting each block's
     * element count until res falls inside one.  The division just above is
     * superseded by the assignment inside the loop.
     * NOTE(review): x1 stays uninitialized if res exceeds the total element
     * count; the idx bound check above presumably guarantees a hit. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* Level 2: position within the hindexed block (child type repetitions). */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.count;

    /* Level 3: blkhindx block index; level 4 (x4): double within that block. */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* Scatter: sum the byte offsets contributed by each level to locate the
     * destination double; source is simply the idx-th packed double. */
    *((double *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(double))) = *((const double *) (const void *) (sbuf + idx * sizeof(double)));
}
730 
/* Host-side launcher: enqueues the unpack kernel for the
 * resized[hindexed[blkhindx[double]]] datatype on the per-device yaksa
 * stream.  Launch geometry is supplied by the caller; errors from the
 * launch are funneled through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_hindexed_blkhindx_double(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the addresses of the kernel arguments. */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_blkhindx_double,
                         grid_dims, block_dims, kern_args, /* sharedMem */ 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
738 
739