/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"
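
/*
 * Pack/unpack kernels for yaksa's hindexed+resized int64_t datatype
 * combinations.  Each __global__ kernel assigns one int64_t element per
 * thread: the flat thread index is decomposed against the datatype
 * metadata tree (md) to locate the element inside the strided/indexed
 * layout, and the element is copied into (pack) or out of (unpack) a
 * contiguous buffer.
 */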
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;
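
    /* Decompose the flat thread index into per-level coordinates.
     * inner_elements tracks how many int64_t elements each remaining
     * metadata level contributes; x0 is the outer count iteration. */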
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;
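
    /* hindexed blocks have non-uniform lengths, so the block index x1
     * cannot come from a plain division; scan the blocklengths until the
     * remaining offset res falls inside block i. */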
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}
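
/* Host-side launchers: each forwards its arguments through cudaLaunchKernel
 * on the stream associated with the target device; grid and block sizes are
 * chosen by the caller. */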
void yaksuri_cudai_pack_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
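
/* A minimal calling sketch (hypothetical; the real caller lives elsewhere in
 * the yaksuri CUDA backend and may pick different launch parameters).  One
 * thread is needed per int64_t element:
 *
 *     uintptr_t nelems = count * md->num_elements;
 *     int n_threads = 256;
 *     int n_blocks = (int) ((nelems + n_threads - 1) / n_threads);
 *     yaksuri_cudai_pack_hindexed_resized_int64_t(inbuf, outbuf, count, md,
 *                                                 n_threads, n_blocks, 1, 1,
 *                                                 device);
 */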

__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;
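
    /* x1 (hvector iteration) and x2 (element within an hvector block) divide
     * evenly, so plain division suffices above; only the nested hindexed
     * level needs a blocklength scan. */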
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}

void yaksuri_cudai_pack_hvector_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hvector_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}

void yaksuri_cudai_pack_blkhindx_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_blkhindx_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;
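
    /* Second hindexed level: x2 picked the child instance within the outer
     * block above; a second blocklength scan over the nested hindexed child
     * yields x3. */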
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}

void yaksuri_cudai_pack_hindexed_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hindexed_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;
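
    /* For a contig parent the per-element stride is simply the child's
     * extent; the nested hindexed displacements are applied on top. */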
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3));
}

void yaksuri_cudai_pack_contig_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_contig_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3));
}

void yaksuri_cudai_pack_resized_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3)) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_resized_hindexed_resized_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_resized_int64_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}