1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack `count` copies of a hindexed(contig(char)) datatype into a
 * contiguous output buffer.  One thread packs exactly one char: the
 * flat thread index is decoded into per-level coordinates (x0..x3)
 * by repeated division against the element counts in the metadata
 * tree `md`.  Expected launch: 1D indexing over count*num_elements
 * threads (NOTE(review): only blockIdx.x is used, so the launcher
 * presumably keeps n_blocks_y == n_blocks_z == 1 -- confirm). */
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may contain more threads than elements */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the `count` outer datatype instances */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: linear search for the hindexed block containing `res`;
     * the guard above ensures some block matches, so x1 is always set */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: repetition of the child within the block; x3: element in contig */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2));
}
58 
/* Host-side launcher: enqueue the hindexed(contig(char)) pack kernel
 * on the per-device yaksa stream with the caller-chosen grid shape.
 * Launch failures are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
66 
/* Unpack a contiguous input buffer into `count` copies of a
 * hindexed(contig(char)) datatype.  Mirror image of the pack kernel:
 * identical coordinate decoding (x0..x3), but the strided address is
 * the destination and the contiguous slot idx is the source.
 * NOTE(review): only blockIdx.x is used -- presumably the launcher
 * keeps n_blocks_y == n_blocks_z == 1; confirm against caller. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may contain more threads than elements */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which outer datatype instance */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: repetition within the block; x3: element in the contig child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* scatter: contiguous source slot idx -> strided destination */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
108 
/* Host-side launcher: enqueue the hindexed(contig(char)) unpack kernel
 * on the per-device yaksa stream with the caller-chosen grid shape.
 * Launch failures are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
116 
/* Pack `count` copies of a hvector(hindexed(contig(char))) datatype
 * into a contiguous buffer.  One thread per char element; the flat
 * index is decoded into coordinates x0 (outer count), x1 (hvector
 * block), x2 (element within hvector blocklength), x3 (hindexed
 * block), x4 (repetition within that block), x5 (contig element).
 * NOTE(review): only blockIdx.x is used -- confirm launcher keeps
 * n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
167 
/* Host-side launcher: enqueue the hvector(hindexed(contig(char))) pack
 * kernel on the per-device yaksa stream with the caller-chosen grid.
 * Launch failures are reported through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hvector_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
175 
/* Unpack a contiguous buffer into `count` copies of a
 * hvector(hindexed(contig(char))) datatype.  Same coordinate decode
 * (x0..x5) as the pack kernel, with source/destination swapped.
 * NOTE(review): only blockIdx.x is used -- confirm launcher keeps
 * n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hvector.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot idx -> strided destination */
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
226 
/* Host-side launcher: enqueue the hvector(hindexed(contig(char)))
 * unpack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hvector_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
234 
/* Pack `count` copies of a blkhindx(hindexed(contig(char))) datatype
 * into a contiguous buffer.  Coordinates: x0 outer count, x1 blkhindx
 * block (displacement via array_of_displs1), x2 element within the
 * fixed blocklength, x3 inner hindexed block, x4 repetition within it,
 * x5 contig element.  NOTE(review): only blockIdx.x is used -- confirm
 * launcher keeps n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
285 
/* Host-side launcher: enqueue the blkhindx(hindexed(contig(char)))
 * pack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_blkhindx_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
293 
/* Unpack a contiguous buffer into `count` copies of a
 * blkhindx(hindexed(contig(char))) datatype.  Same coordinate decode
 * (x0..x5) as the pack kernel, with source/destination swapped.
 * NOTE(review): only blockIdx.x is used -- confirm launcher keeps
 * n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.blkhindx.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot idx -> strided destination */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
344 
/* Host-side launcher: enqueue the blkhindx(hindexed(contig(char)))
 * unpack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_blkhindx_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
352 
/* Pack `count` copies of a hindexed(hindexed(contig(char))) datatype
 * into a contiguous buffer.  Coordinates: x0 outer count, x1 outer
 * hindexed block (linear search), x2 repetition within it, x3 inner
 * hindexed block (second linear search), x4 repetition within that,
 * x5 contig element.  NOTE(review): only blockIdx.x is used --
 * confirm launcher keeps n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outer hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: inner hindexed block containing the remaining `res` */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
414 
/* Host-side launcher: enqueue the hindexed(hindexed(contig(char)))
 * pack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_hindexed_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
422 
/* Unpack a contiguous buffer into `count` copies of a
 * hindexed(hindexed(contig(char))) datatype.  Same coordinate decode
 * (x0..x5, two linear block searches) as the pack kernel, with
 * source/destination swapped.  NOTE(review): only blockIdx.x is used
 * -- confirm launcher keeps n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outer hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: inner hindexed block containing the remaining `res` */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x3 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot idx -> strided destination */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
484 
/* Host-side launcher: enqueue the hindexed(hindexed(contig(char)))
 * unpack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_hindexed_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
492 
/* Pack `count` copies of a contig(hindexed(contig(char))) datatype
 * into a contiguous buffer.  Coordinates: x0 outer count, x1 contig
 * repetition, x2 hindexed block (linear search), x3 repetition within
 * that block, x4 contig element.  NOTE(review): only blockIdx.x is
 * used -- confirm launcher keeps n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3));
}
539 
/* Host-side launcher: enqueue the contig(hindexed(contig(char)))
 * pack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_pack_contig_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
547 
/* Unpack a contiguous buffer into `count` copies of a
 * contig(hindexed(contig(char))) datatype.  Same coordinate decode
 * (x0..x4) as the pack kernel, with source/destination swapped.
 * NOTE(review): only blockIdx.x is used -- confirm launcher keeps
 * n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.contig.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x2 = i;
                    res %= in_elems;
                    inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot idx -> strided destination */
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
594 
/* Host-side launcher: enqueue the contig(hindexed(contig(char)))
 * unpack kernel on the per-device yaksa stream with the caller-chosen
 * grid.  Launch failures go through YAKSURI_CUDAI_CUDA_ERR_CHECK. */
void yaksuri_cudai_unpack_contig_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_contig_char,
                         grid_dims, block_dims, args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
602 
/* Pack `count` copies of a resized(hindexed(contig(char))) datatype
 * into a contiguous buffer.  The resized level contributes only the
 * outer extent; coordinates: x0 outer count, x1 hindexed block
 * (linear search), x2 repetition within it, x3 contig element.
 * NOTE(review): only blockIdx.x is used -- confirm launcher keeps
 * n_blocks_y == n_blocks_z == 1. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: grid may overshoot the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: hindexed block containing `res` (guard guarantees a match) */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot idx */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3));
}
644 
/* Host-side launcher for the resized[hindexed[contig[char]]] pack kernel.
 * Forwards the pup arguments unchanged, launches on the per-device yaksa
 * stream with no dynamic shared memory, and checks the launch status. */
void yaksuri_cudai_pack_resized_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_contig_char,
                         grid_dims, block_dims, kernel_args, 0 /* no dynamic shared mem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
652 
/* Unpack kernel for a resized[hindexed[contig[char]]] datatype tree: the
 * exact mirror of the pack kernel — each thread copies one char from the
 * contiguous source buffer into the strided destination layout described
 * by "md".
 *
 * Launch expectation: 1-D indexing only (just blockIdx.x/blockDim.x are
 * read), with at least count * md->num_elements threads in total; excess
 * threads exit at the bounds check below. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flat global thread id == index of the packed element this thread handles. */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Threads beyond the total element count have no work. */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which of the "count" top-level (resized) elements idx falls in;
     * res becomes the remainder within that element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: index of the hindexed block containing the element.  Blocks can
     * have different lengths, so scan the blocklength array instead of
     * dividing.  NOTE(review): assumes md->num_elements is consistent with
     * the blocklengths so the loop always breaks and x1 is assigned before
     * use — this invariant comes from the metadata generator, not this file. */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                                 md->u.resized.child->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* x2: position within the hindexed block, in units of the contig child. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.contig.count;

    /* x3: element index inside the innermost contiguous run. */
    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* Scatter one char into the strided destination; the source is read
     * contiguously at idx, so loads are coalesced across the warp. */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3)) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
694 
/* Host-side launcher for the resized[hindexed[contig[char]]] unpack kernel.
 * Forwards the pup arguments unchanged, launches on the per-device yaksa
 * stream with no dynamic shared memory, and checks the launch status. */
void yaksuri_cudai_unpack_resized_hindexed_contig_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t launch_err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_contig_char,
                         grid_dims, block_dims, kernel_args, 0 /* no dynamic shared mem */,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(launch_err);
}
702 
703