/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

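/*
 * Each pack kernel below copies one float element per thread: the global
 * thread index "idx" names one element of the packed (contiguous) buffer,
 * and the quotient/remainder chain over "inner_elements" recovers the
 * per-level coordinates (x0, x1, ...) inside the nested datatype.  Levels
 * described by hindexed metadata have non-uniform block lengths, so their
 * coordinate is found by a linear search over array_of_blocklengths; the
 * bounds check on idx guarantees the search always terminates with the
 * coordinate assigned.  Unpack kernels use the same decomposition with the
 * source and destination address computations swapped.
 */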
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(float)));
}

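/*
 * Host-side launcher: kernel arguments are passed by address through "args",
 * and the kernel runs on the stream associated with the target device.
 * Launch errors are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK.
 */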
void yaksuri_cudai_pack_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
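
/*
 * Illustrative sketch only (not part of the generated code): the caller is
 * expected to size the launch so that n_threads * n_blocks_x covers all
 * count * md->num_elements elements.  Assuming a hypothetical block size of
 * 256 threads, a call might look like:
 *
 *     uintptr_t n_elems = count * md->num_elements;
 *     int n_threads = 256;
 *     int n_blocks_x = (int) ((n_elems + n_threads - 1) / n_threads);
 *     yaksuri_cudai_pack_hvector_hindexed_float(inbuf, outbuf, count, md,
 *                                               n_threads, n_blocks_x, 1, 1,
 *                                               device);
 */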

__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

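/* hvector of hvector of hindexed: two strided levels above the hindexed search. */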
__global__ void yaksuri_cudai_kernel_pack_hvector_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}

void yaksuri_cudai_pack_hvector_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hvector_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

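/* blkhindx outer level: the block displacement comes from array_of_displs rather than a uniform stride. */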
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}

void yaksuri_cudai_pack_blkhindx_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_blkhindx_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

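/* hindexed outer level: both the outermost and innermost levels need a blocklength search. */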
__global__ void yaksuri_cudai_kernel_pack_hindexed_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}

void yaksuri_cudai_pack_hindexed_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hvector.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.hvector.stride;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_hindexed_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

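/* contig outer level: consecutive children, so the effective stride is the child extent. */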
__global__ void yaksuri_cudai_kernel_pack_contig_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x4 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.hvector.stride;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(float)));
}

void yaksuri_cudai_pack_contig_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hvector.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x4 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.hvector.stride;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_contig_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

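/* resized outer level: only adjusts the extent; the coordinate chain starts at the child hvector. */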
__global__ void yaksuri_cudai_kernel_pack_resized_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res;

    intptr_t stride2 = md->u.resized.child->u.hvector.stride;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride2 + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(float)));
}

void yaksuri_cudai_pack_resized_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res;

    intptr_t stride2 = md->u.resized.child->u.hvector.stride;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hvector.child->extent;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride2 + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}

void yaksuri_cudai_unpack_resized_hvector_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hvector_hindexed_float,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}