/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

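/*
 * Each __global__ kernel below packs (or unpacks) one _Bool element per
 * thread. The flat thread index `idx` is decomposed into one coordinate
 * per level of the datatype tree (x0, x1, ...) by repeatedly dividing the
 * residual index by the number of elements contained in each child level.
 * Hindexed levels have non-uniform block lengths, so their coordinate is
 * found by a linear scan over the blocks instead of a division.
 */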
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(_Bool)));
}

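/*
 * Host-side launcher: forwards the kernel arguments through
 * cudaLaunchKernel on the per-device stream. The caller supplies the
 * launch geometry; an illustrative (not prescribed) choice would be
 *
 *     int n_threads = 256;
 *     int n_blocks  = (count * md->num_elements + n_threads - 1) / n_threads;
 *
 * so that one thread is available per _Bool element; threads past the end
 * exit early via the bounds check at the top of the kernel.
 */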
void yaksuri_cudai_pack_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

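/*
 * hvector variant: two extra coordinates relative to the contig case.
 * x1 indexes the hvector blocks (advanced by the byte stride
 * md->u.hvector.stride) and x2 indexes elements within a block (advanced
 * by the child extent).
 */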
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.hvector.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool)));
}

void yaksuri_cudai_pack_hvector_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.hvector.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_hvector_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

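/*
 * blkhindx variant: like hvector, but blocks sit at explicit byte
 * displacements (array_of_displs1[x1]) instead of at a uniform stride,
 * while the per-block blocklength stays constant.
 */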
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.blkhindx.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool)));
}

void yaksuri_cudai_pack_blkhindx_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.blkhindx.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_blkhindx_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

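/*
 * hindexed outer level: block lengths vary, so x1 cannot be computed by a
 * division. The first loop below walks the blocks, subtracting each
 * block's element count from the residual index until it lands inside a
 * block (the bounds check at the top guarantees it does). The same scan
 * is repeated for the innermost hindexed level.
 */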
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.hindexed.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool)));
}

void yaksuri_cudai_pack_hindexed_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.hindexed.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_hindexed_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

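/*
 * contig_contig variant: both intermediate levels are contiguous, so x1
 * and x2 are plain divisions and their byte offsets are multiples of the
 * respective child extents (stride1, stride2).
 */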
__global__ void yaksuri_cudai_kernel_pack_contig_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.contig.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(_Bool)));
}

void yaksuri_cudai_pack_contig_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.contig.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_contig_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

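/*
 * resized variant: the resized wrapper contributes no coordinate of its
 * own; it only changes the outer extent used with x0, so the
 * decomposition proceeds directly to the wrapped contig child.
 */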
__global__ void yaksuri_cudai_kernel_pack_resized_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.resized.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(_Bool)));
}

void yaksuri_cudai_pack_resized_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.resized.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.hindexed.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}

void yaksuri_cudai_unpack_resized_contig_hindexed__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_hindexed__Bool,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}