1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack kernel: each thread copies ONE int8_t element from the
 * resized->hindexed source layout into the contiguous pack buffer.
 * Indexing is 1-D only (blockIdx.x / blockDim.x / threadIdx.x);
 * threads past count * md->num_elements return immediately. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (resized) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element offset within the chosen hindexed block */
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int8_t)));
}
52 
/* Host wrapper: launches the resized->hindexed int8_t pack kernel on
 * the given device's yaksa stream; the kernel indexes with the x
 * dimension only. Launch errors are reported via the yaksa macro. */
void yaksuri_cudai_pack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
60 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (resized) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element offset within the chosen hindexed block */
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
96 
/* Host wrapper: launches the resized->hindexed int8_t unpack kernel on
 * the given device's yaksa stream; the kernel indexes with the x
 * dimension only. Launch errors are reported via the yaksa macro. */
void yaksuri_cudai_unpack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
104 
/* Pack kernel: each thread copies ONE int8_t element from the
 * hvector->resized->hindexed source layout into the contiguous pack
 * buffer. 1-D indexing only; out-of-range threads return early. */
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: hvector block; x2: element within the
     * hvector blocklength — peeled off res most-significant first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.hindexed.count;

    /* x3: which hindexed block contains the remaining offset.
     * NOTE(review): x3 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen hindexed block */
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}
149 
/* Host wrapper: launches the hvector->resized->hindexed int8_t pack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_pack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
157 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * hvector->resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: hvector block; x2: element within the
     * hvector blocklength — peeled off res most-significant first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.hindexed.count;

    /* x3: which hindexed block contains the remaining offset.
     * NOTE(review): x3 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen hindexed block */
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
202 
/* Host wrapper: launches the hvector->resized->hindexed int8_t unpack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_unpack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
210 
/* Pack kernel: each thread copies ONE int8_t element from the
 * blkhindx->resized->hindexed source layout into the contiguous pack
 * buffer. 1-D indexing only; out-of-range threads return early. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: blkhindx block; x2: element within the
     * blkhindx blocklength — peeled off res most-significant first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.hindexed.count;

    /* x3: which hindexed block contains the remaining offset.
     * NOTE(review): x3 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen hindexed block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}
255 
/* Host wrapper: launches the blkhindx->resized->hindexed int8_t pack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_pack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
263 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * blkhindx->resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: blkhindx block; x2: element within the
     * blkhindx blocklength — peeled off res most-significant first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.hindexed.count;

    /* x3: which hindexed block contains the remaining offset.
     * NOTE(review): x3 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen hindexed block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
308 
/* Host wrapper: launches the blkhindx->resized->hindexed int8_t unpack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_unpack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
316 
/* Pack kernel: each thread copies ONE int8_t element from the
 * hindexed->resized->hindexed source layout into the contiguous pack
 * buffer. 1-D indexing only; out-of-range threads return early. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (hindexed) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: which outer hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element within the outer block's child extent */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.hindexed.count;

    /* x3: which inner hindexed block contains the remaining offset
     * (same unset caveat as x1) */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen inner hindexed block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}
372 
/* Host wrapper: launches the hindexed->resized->hindexed int8_t pack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_pack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
380 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * hindexed->resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (hindexed) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: which outer hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element within the outer block's child extent */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.hindexed.count;

    /* x3: which inner hindexed block contains the remaining offset
     * (same unset caveat as x1) */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset within the chosen inner hindexed block */
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
436 
/* Host wrapper: launches the hindexed->resized->hindexed int8_t unpack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_unpack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
444 
/* Pack kernel: each thread copies ONE int8_t element from the
 * contig->resized->hindexed source layout into the contiguous pack
 * buffer. 1-D indexing only; out-of-range threads return early. */
__global__ void yaksuri_cudai_kernel_pack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: contig child index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.hindexed.count;

    /* x2: which hindexed block contains the remaining offset.
     * NOTE(review): x2 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset within the chosen hindexed block */
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int8_t)));
}
485 
/* Host wrapper: launches the contig->resized->hindexed int8_t pack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_pack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
493 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * contig->resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level element; x1: contig child index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.hindexed.count;

    /* x2: which hindexed block contains the remaining offset.
     * NOTE(review): x2 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset within the chosen hindexed block */
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
534 
/* Host wrapper: launches the contig->resized->hindexed int8_t unpack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_unpack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
542 
/* Pack kernel: each thread copies ONE int8_t element from the
 * resized->resized->hindexed source layout into the contiguous pack
 * buffer. 1-D indexing only; out-of-range threads return early. */
__global__ void yaksuri_cudai_kernel_pack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (resized) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element offset within the chosen hindexed block */
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int8_t)));
}
578 
/* Host wrapper: launches the resized->resized->hindexed int8_t pack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_pack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
586 
/* Unpack kernel: mirror of the pack kernel — each thread copies ONE
 * int8_t element from the contiguous pack buffer back into the
 * resized->resized->hindexed destination layout. 1-D indexing only. */
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen to uintptr_t BEFORE multiplying: blockIdx.x * blockDim.x is
     * otherwise a 32-bit unsigned product that can wrap on very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level (resized) element this thread falls in */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.hindexed.count;

    /* x1: which hindexed block contains the remaining offset.
     * NOTE(review): x1 stays unset if res lands past every block; the
     * generated metadata is assumed to guarantee a match — confirm. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: element offset within the chosen hindexed block */
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}
622 
/* Host wrapper: launches the resized->resized->hindexed int8_t unpack
 * kernel on the given device's yaksa stream. Launch errors are
 * reported via the yaksa macro. */
void yaksuri_cudai_unpack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_hindexed_int8_t,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
630 
631