1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack kernel for a blkhindx[contig[_Bool]] datatype tree: each thread
 * gathers exactly one _Bool from the strided source layout described by
 * md into the contiguous pack buffer.
 * Expected launch: 1D grid / 1D blocks sized so that the grid covers
 * count * md->num_elements threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit and can
     * silently overflow for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid rarely divides the element count evenly */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2));
}
47 
/* Host-side launcher for the blkhindx[contig[_Bool]] pack kernel.
 * The caller chooses the launch configuration; the kernel is enqueued on
 * the per-device stream and launch errors are reported through
 * YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
55 
/* Unpack kernel for a blkhindx[contig[_Bool]] datatype tree: each thread
 * scatters exactly one _Bool from the contiguous pack buffer into the
 * strided destination layout described by md.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
86 
/* Host-side launcher for the blkhindx[contig[_Bool]] unpack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
94 
/* Pack kernel for an hvector[blkhindx[contig[_Bool]]] datatype tree: each
 * thread gathers one _Bool from the three-level strided source layout into
 * the contiguous pack buffer.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
134 
/* Host-side launcher for the hvector[blkhindx[contig[_Bool]]] pack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hvector_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
142 
/* Unpack kernel for an hvector[blkhindx[contig[_Bool]]] datatype tree: each
 * thread scatters one _Bool from the contiguous pack buffer into the
 * three-level strided destination layout described by md.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
182 
/* Host-side launcher for the hvector[blkhindx[contig[_Bool]]] unpack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hvector_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
190 
/* Pack kernel for a blkhindx[blkhindx[contig[_Bool]]] datatype tree: each
 * thread gathers one _Bool from the doubly-displaced source layout into the
 * contiguous pack buffer.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
230 
/* Host-side launcher for the blkhindx[blkhindx[contig[_Bool]]] pack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_blkhindx_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
238 
/* Unpack kernel for a blkhindx[blkhindx[contig[_Bool]]] datatype tree: each
 * thread scatters one _Bool from the contiguous pack buffer into the
 * doubly-displaced destination layout described by md.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
278 
/* Host-side launcher for the blkhindx[blkhindx[contig[_Bool]]] unpack
 * kernel. Enqueues the kernel on the per-device stream; launch errors are
 * reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_blkhindx_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
286 
/* Pack kernel for an hindexed[blkhindx[contig[_Bool]]] datatype tree: each
 * thread gathers one _Bool from the variable-blocklength source layout into
 * the contiguous pack buffer. The hindexed level has per-block lengths, so
 * its coordinate is found by a linear scan rather than a division.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* find the hindexed block containing res; the guard above ensures
     * res < md->num_elements, so the loop always assigns x1 (the zero
     * initializer only silences uninitialized-use warnings) */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* peel the remaining flat index into per-level coordinates */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
337 
/* Host-side launcher for the hindexed[blkhindx[contig[_Bool]]] pack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hindexed_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
345 
/* Unpack kernel for an hindexed[blkhindx[contig[_Bool]]] datatype tree: each
 * thread scatters one _Bool from the contiguous pack buffer into the
 * variable-blocklength destination layout. The hindexed coordinate is found
 * by a linear scan over the per-block lengths.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* find the hindexed block containing res; the guard above ensures
     * res < md->num_elements, so the loop always assigns x1 (the zero
     * initializer only silences uninitialized-use warnings) */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* peel the remaining flat index into per-level coordinates */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
396 
/* Host-side launcher for the hindexed[blkhindx[contig[_Bool]]] unpack
 * kernel. Enqueues the kernel on the per-device stream; launch errors are
 * reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hindexed_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
404 
/* Pack kernel for a contig[blkhindx[contig[_Bool]]] datatype tree: each
 * thread gathers one _Bool from the strided/displaced source layout into
 * the contiguous pack buffer.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3));
}
440 
/* Host-side launcher for the contig[blkhindx[contig[_Bool]]] pack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
448 
/* Unpack kernel for a contig[blkhindx[contig[_Bool]]] datatype tree: each
 * thread scatters one _Bool from the contiguous pack buffer into the
 * strided/displaced destination layout described by md.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
484 
/* Host-side launcher for the contig[blkhindx[contig[_Bool]]] unpack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
492 
/* Pack kernel for a resized[blkhindx[contig[_Bool]]] datatype tree: each
 * thread gathers one _Bool from the displaced source layout into the
 * contiguous pack buffer. The resized level contributes only the outer
 * extent (x0 * extent); it adds no coordinate of its own.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    /* gather: strided/displaced source element -> contiguous destination */
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3));
}
523 
/* Host-side launcher for the resized[blkhindx[contig[_Bool]]] pack kernel.
 * Enqueues the kernel on the per-device stream; launch errors are reported
 * through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_resized_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
531 
/* Unpack kernel for a resized[blkhindx[contig[_Bool]]] datatype tree: each
 * thread scatters one _Bool from the contiguous pack buffer into the
 * displaced destination layout. The resized level contributes only the
 * outer extent (x0 * extent); it adds no coordinate of its own.
 * Expected launch: 1D grid / 1D blocks covering count * md->num_elements
 * threads (tail threads exit at the guard). */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying to avoid 32-bit overflow on huge grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    /* scatter: contiguous source element -> strided/displaced destination */
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
562 
/* Host-side launcher for the resized[blkhindx[contig[_Bool]]] unpack
 * kernel. Enqueues the kernel on the per-device stream; launch errors are
 * reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_resized_blkhindx_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_contig__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
570 
571