1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack kernel: copies one float per thread from a hindexed-of-hindexed
 * source layout (inbuf) into a contiguous destination (outbuf).
 *
 * Thread mapping: one element per thread, flat 1D index.
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches with (n_blocks_x, n_blocks_y, n_blocks_z) —
 * confirm y/z block counts are always 1 for this kernel. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is otherwise a
     * 32-bit unsigned product and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be larger than the element count */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level replica of the datatype this element is in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: which outer hindexed block; blocks have non-uniform sizes so
     * scan them.  res always lands in some block when the metadata is
     * consistent; init defensively to avoid UB if it ever does not. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: which child-type instance inside the outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: which inner hindexed block (same non-uniform scan as x1) */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: float offset within the innermost block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* gather: contiguous write at idx, displaced/strided read */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(float)));
}
72 
/* Host-side wrapper: launches the hindexed-hindexed float pack kernel
 * asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_pack_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
80 
/* Unpack kernel: mirror of the pack kernel — reads one float per thread
 * from the contiguous source (inbuf) and scatters it into the
 * hindexed-of-hindexed destination layout (outbuf).
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outer hindexed block (non-uniform sizes -> linear scan);
     * defensively initialized */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: child-type instance inside the outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: inner hindexed block */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: float offset within the innermost block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* scatter: displaced write, contiguous read at idx */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
136 
/* Host-side wrapper: launches the hindexed-hindexed float unpack kernel
 * asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_unpack_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
144 
/* Pack kernel: one float per thread from an hvector-of-hindexed-of-hindexed
 * source layout (inbuf) into a contiguous destination (outbuf).
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block; x2: child instance within the blocklength —
     * uniform sizes, so plain division suffices here */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: middle-level hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* gather: contiguous write at idx, strided/displaced read */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}
209 
/* Host-side wrapper: launches the hvector-hindexed-hindexed float pack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_pack_hvector_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
217 
/* Unpack kernel: mirror of the hvector-hindexed-hindexed pack kernel —
 * contiguous read from inbuf, scattered write into outbuf.
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block; x2: child instance within the blocklength */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: middle-level hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* scatter: displaced write, contiguous read at idx */
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
282 
/* Host-side wrapper: launches the hvector-hindexed-hindexed float unpack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_unpack_hvector_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
290 
/* Pack kernel: one float per thread from a blkhindx-of-hindexed-of-hindexed
 * source layout (inbuf) into a contiguous destination (outbuf).
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: blkhindx block; x2: child instance within the (uniform)
     * blocklength — plain division suffices */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: middle-level hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* gather: contiguous write at idx, displaced read */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}
355 
/* Host-side wrapper: launches the blkhindx-hindexed-hindexed float pack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_pack_blkhindx_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
363 
/* Unpack kernel: mirror of the blkhindx-hindexed-hindexed pack kernel —
 * contiguous read from inbuf, scattered write into outbuf.
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: blkhindx block; x2: child instance within the blocklength */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: middle-level hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* scatter: displaced write, contiguous read at idx */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
428 
/* Host-side wrapper: launches the blkhindx-hindexed-hindexed float unpack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_unpack_blkhindx_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
436 
/* Pack kernel: one float per thread from a three-level hindexed layout
 * (inbuf) into a contiguous destination (outbuf).
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outermost hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: child instance inside the outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: middle-level hindexed block */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* gather: contiguous write at idx, displaced read */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float)));
}
512 
/* Host-side wrapper: launches the hindexed-hindexed-hindexed float pack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_pack_hindexed_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
520 
/* Unpack kernel: mirror of the three-level hindexed pack kernel —
 * contiguous read from inbuf, scattered write into outbuf.
 *
 * NOTE(review): only the x grid dimension contributes to idx even though
 * the host wrapper launches a 3D grid — confirm y/z block counts are 1. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit wrap on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outermost hindexed block (non-uniform -> scan);
     * defensively initialized */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: child instance inside the outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: middle-level hindexed block */
    uintptr_t x3 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: child instance inside the middle-level block */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.hindexed.count;

    /* x5: innermost hindexed block */
    uintptr_t x5 = 0;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x5 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x6: float offset within the innermost block */
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* scatter: displaced write, contiguous read at idx */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
596 
/* Host-side wrapper: launches the hindexed-hindexed-hindexed float unpack
 * kernel asynchronously on the per-device yaksa stream. */
void yaksuri_cudai_unpack_hindexed_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t err = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_hindexed_float,
                                       grid_dims, block_dims, kernel_args, 0,
                                       yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
604 
/*
 * Pack kernel for the type tree contig(hindexed(hindexed(float))).
 *
 * One thread copies exactly one float: the thread's flat index `idx`
 * names a slot in the contiguous output buffer, and the same index is
 * decomposed digit-by-digit into coordinates (x0..x5) that locate the
 * matching float inside the strided source layout described by `md`.
 *
 * NOTE(review): only blockIdx.x contributes to idx even though the host
 * wrapper launches a 3-D grid; blocks with y/z > 0 would redo identical
 * (idempotent) work -- presumably n_blocks_y == n_blocks_z == 1 in
 * practice; confirm against the caller.
 */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;    /* strided source */
    char *__restrict__ dbuf = (char *) outbuf;               /* contiguous destination */
    uintptr_t extent = md->extent;                           /* byte extent of one top-level element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;   /* flat float index */
    uintptr_t res = idx;                                     /* remainder still to be decomposed */
    uintptr_t inner_elements = md->num_elements;             /* floats per top-level element */

    /* tail guard: the grid is sized upward, so trailing threads bail out */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the `count` top-level elements */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: which hindexed block at the middle level.  Block lengths vary,
     * so scan them linearly, peeling `res` down; `res` always lands in
     * some block, so the loop breaks before x2 could be read
     * uninitialized. */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x3: which copy of the inner child inside the chosen middle block */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.hindexed.count;

    /* x4: which hindexed block at the innermost level (same scan as x2) */
    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x4 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x5: float offset within the innermost block */
    uintptr_t x5 = res;

    /* per-level byte strides and displacement tables (displacements are
     * byte offsets, added directly to the char* base) */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* gather: read the (x0..x5)-addressed float from the strided source
     * and store it at this thread's contiguous slot */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(float)));
}
665 
/* Host-side launcher for the pack kernel handling the
 * contig(hindexed(hindexed(float))) type tree.  Enqueues the kernel on
 * the per-device yaksa stream; any launch failure is reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the *addresses* of the kernel arguments */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cuda_rc =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_hindexed_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cuda_rc);
}
673 
/*
 * Unpack kernel for the type tree contig(hindexed(hindexed(float))).
 *
 * Mirror image of the pack kernel: one thread copies exactly one float
 * from the contiguous input buffer (slot `idx`) to its strided position
 * in the destination layout described by `md`, located by decomposing
 * `idx` into coordinates (x0..x5).
 *
 * NOTE(review): only blockIdx.x contributes to idx even though the host
 * wrapper launches a 3-D grid; blocks with y/z > 0 would redo identical
 * (idempotent) work -- presumably n_blocks_y == n_blocks_z == 1 in
 * practice; confirm against the caller.
 */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;    /* contiguous source */
    char *__restrict__ dbuf = (char *) outbuf;               /* strided destination */
    uintptr_t extent = md->extent;                           /* byte extent of one top-level element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;   /* flat float index */
    uintptr_t res = idx;                                     /* remainder still to be decomposed */
    uintptr_t inner_elements = md->num_elements;             /* floats per top-level element */

    /* tail guard: the grid is sized upward, so trailing threads bail out */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the `count` top-level elements */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: which hindexed block at the middle level.  Block lengths vary,
     * so scan them linearly, peeling `res` down; `res` always lands in
     * some block, so the loop breaks before x2 could be read
     * uninitialized. */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x3: which copy of the inner child inside the chosen middle block */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.hindexed.count;

    /* x4: which hindexed block at the innermost level (same scan as x2) */
    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x4 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x5: float offset within the innermost block */
    uintptr_t x5 = res;

    /* per-level byte strides and displacement tables (displacements are
     * byte offsets, added directly to the char* base) */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* scatter: read this thread's contiguous slot and store it at the
     * (x0..x5)-addressed position in the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
734 
/* Host-side launcher for the unpack kernel handling the
 * contig(hindexed(hindexed(float))) type tree.  Enqueues the kernel on
 * the per-device yaksa stream; any launch failure is reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the *addresses* of the kernel arguments */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cuda_rc =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_hindexed_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cuda_rc);
}
742 
/*
 * Pack kernel for the type tree resized(hindexed(hindexed(float))).
 *
 * One thread copies exactly one float from its strided position in the
 * source layout described by `md` into slot `idx` of the contiguous
 * output buffer.  The resized level contributes only the outer extent
 * (no coordinate of its own), so `idx` decomposes into (x0..x4).
 *
 * NOTE(review): only blockIdx.x contributes to idx even though the host
 * wrapper launches a 3-D grid; blocks with y/z > 0 would redo identical
 * (idempotent) work -- presumably n_blocks_y == n_blocks_z == 1 in
 * practice; confirm against the caller.
 */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;    /* strided source */
    char *__restrict__ dbuf = (char *) outbuf;               /* contiguous destination */
    uintptr_t extent = md->extent;                           /* resized byte extent of one element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;   /* flat float index */
    uintptr_t res = idx;                                     /* remainder still to be decomposed */
    uintptr_t inner_elements = md->num_elements;             /* floats per top-level element */

    /* tail guard: the grid is sized upward, so trailing threads bail out */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the `count` top-level (resized) elements */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block at the outer hindexed level.  Block
     * lengths vary, so scan them linearly, peeling `res` down; `res`
     * always lands in some block, so the loop breaks before x1 could be
     * read uninitialized. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: which copy of the inner child inside the chosen outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.hindexed.count;

    /* x3: which hindexed block at the innermost level (same scan as x1) */
    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: float offset within the innermost block */
    uintptr_t x4 = res;

    /* per-level displacement tables and strides (displacements are byte
     * offsets, added directly to the char* base) */
    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* gather: read the (x0..x4)-addressed float from the strided source
     * and store it at this thread's contiguous slot */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(float)));
}
798 
/* Host-side launcher for the pack kernel handling the
 * resized(hindexed(hindexed(float))) type tree.  Enqueues the kernel on
 * the per-device yaksa stream; any launch failure is reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the *addresses* of the kernel arguments */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cuda_rc =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_hindexed_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cuda_rc);
}
806 
/*
 * Unpack kernel for the type tree resized(hindexed(hindexed(float))).
 *
 * Mirror image of the pack kernel: one thread copies exactly one float
 * from slot `idx` of the contiguous input buffer to its strided position
 * in the destination layout described by `md`.  The resized level
 * contributes only the outer extent (no coordinate of its own), so `idx`
 * decomposes into (x0..x4).
 *
 * NOTE(review): only blockIdx.x contributes to idx even though the host
 * wrapper launches a 3-D grid; blocks with y/z > 0 would redo identical
 * (idempotent) work -- presumably n_blocks_y == n_blocks_z == 1 in
 * practice; confirm against the caller.
 */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;    /* contiguous source */
    char *__restrict__ dbuf = (char *) outbuf;               /* strided destination */
    uintptr_t extent = md->extent;                           /* resized byte extent of one element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;   /* flat float index */
    uintptr_t res = idx;                                     /* remainder still to be decomposed */
    uintptr_t inner_elements = md->num_elements;             /* floats per top-level element */

    /* tail guard: the grid is sized upward, so trailing threads bail out */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the `count` top-level (resized) elements */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block at the outer hindexed level.  Block
     * lengths vary, so scan them linearly, peeling `res` down; `res`
     * always lands in some block, so the loop breaks before x1 could be
     * read uninitialized. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: which copy of the inner child inside the chosen outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.hindexed.count;

    /* x3: which hindexed block at the innermost level (same scan as x1) */
    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;   /* no-op here (res < in_elems); kept as generated */
                    inner_elements = md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x4: float offset within the innermost block */
    uintptr_t x4 = res;

    /* per-level displacement tables and strides (displacements are byte
     * offsets, added directly to the char* base) */
    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* scatter: read this thread's contiguous slot and store it at the
     * (x0..x4)-addressed position in the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
862 
/* Host-side launcher for the unpack kernel handling the
 * resized(hindexed(hindexed(float))) type tree.  Enqueues the kernel on
 * the per-device yaksa stream; any launch failure is reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_hindexed_hindexed_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel takes the *addresses* of the kernel arguments */
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);

    cudaError_t cuda_rc =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_hindexed_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cuda_rc);
}
870 
871