1 /*
2  * Copyright (C) by Argonne National Laboratory
3  *     See COPYRIGHT in top-level directory
4  *
5  * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6  */
7 
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16 
/* Pack kernel for a resized[blkhindx[int16_t]] datatype tree: each thread
 * copies exactly one int16_t from the strided source layout described by
 * *md into the contiguous pack buffer at position idx. */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first;
     * inner_elements shrinks by each level's fan-out as we descend */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source offset = outer extent stride + block
     * displacement + element offset within the block */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int16_t)));
}
41 
/* Host-side launcher: runs the resized[blkhindx[int16_t]] pack kernel on
 * the per-device yaksa stream with the caller-chosen grid geometry. */
void yaksuri_cudai_pack_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
49 
/* Unpack kernel for a resized[blkhindx[int16_t]] datatype tree: mirror of
 * the pack kernel — each thread reads one int16_t from the contiguous
 * source buffer and scatters it into the strided destination layout. */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index into per-level coordinates, outermost first */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
74 
/* Host-side launcher: runs the resized[blkhindx[int16_t]] unpack kernel on
 * the per-device yaksa stream with the caller-chosen grid geometry. */
void yaksuri_cudai_unpack_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
82 
/* Pack kernel for an hvector[resized[blkhindx[int16_t]]] datatype tree:
 * each thread gathers one int16_t from the nested strided layout into the
 * contiguous pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index level by level: x0 = outer count, x1 = hvector
     * block, x2 = element within hvector block, x3 = blkhindx block,
     * x4 = element within blkhindx block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source combines each level's stride/displacement */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t)));
}
116 
/* Host-side launcher: runs the hvector[resized[blkhindx[int16_t]]] pack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_pack_hvector_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
124 
/* Unpack kernel for an hvector[resized[blkhindx[int16_t]]] datatype tree:
 * mirror of the pack kernel — each thread scatters one int16_t from the
 * contiguous source buffer into the nested strided destination layout. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index level by level: x0 = outer count, x1 = hvector
     * block, x2 = element within hvector block, x3 = blkhindx block,
     * x4 = element within blkhindx block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
158 
/* Host-side launcher: runs the hvector[resized[blkhindx[int16_t]]] unpack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_unpack_hvector_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
166 
/* Pack kernel for a blkhindx[resized[blkhindx[int16_t]]] datatype tree:
 * each thread gathers one int16_t from the doubly block-indexed source
 * layout into the contiguous pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index level by level: x0 = outer count, x1 = outer
     * blkhindx block, x2 = element within it, x3 = inner blkhindx block,
     * x4 = element within the inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source sums both displacement tables plus strides */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t)));
}
200 
/* Host-side launcher: runs the blkhindx[resized[blkhindx[int16_t]]] pack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_pack_blkhindx_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
208 
/* Unpack kernel for a blkhindx[resized[blkhindx[int16_t]]] datatype tree:
 * mirror of the pack kernel — each thread scatters one int16_t from the
 * contiguous source buffer into the doubly block-indexed destination. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index level by level: x0 = outer count, x1 = outer
     * blkhindx block, x2 = element within it, x3 = inner blkhindx block,
     * x4 = element within the inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
242 
/* Host-side launcher: runs the blkhindx[resized[blkhindx[int16_t]]] unpack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_unpack_blkhindx_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
250 
/* Pack kernel for an hindexed[resized[blkhindx[int16_t]]] datatype tree:
 * each thread gathers one int16_t from the hindexed source layout into the
 * contiguous pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have per-block lengths, so the block index is found
     * by walking the blocklength array rather than by division.  x1 is
     * initialized so it is never read indeterminate if the loop were to
     * finish without the break firing (res should always land in a block,
     * but the explicit init makes that failure mode benign instead of UB). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* peel the remaining levels: x2 = element within hindexed block,
     * x3 = blkhindx block, x4 = element within the blkhindx block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source sums both displacement tables plus strides */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t)));
}
295 
/* Host-side launcher: runs the hindexed[resized[blkhindx[int16_t]]] pack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_pack_hindexed_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
303 
/* Unpack kernel for an hindexed[resized[blkhindx[int16_t]]] datatype tree:
 * mirror of the pack kernel — each thread scatters one int16_t from the
 * contiguous source buffer into the hindexed destination layout. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have per-block lengths, so the block index is found
     * by walking the blocklength array rather than by division.  x1 is
     * initialized so it is never read indeterminate if the loop were to
     * finish without the break firing (res should always land in a block,
     * but the explicit init makes that failure mode benign instead of UB). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
            uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                                 md->u.hindexed.child->num_elements;
            if (res < in_elems) {
                    x1 = i;
                    res %= in_elems;
                    inner_elements = md->u.hindexed.child->num_elements;
                    break;
            } else {
                    res -= in_elems;
            }
    }

    /* peel the remaining levels: x2 = element within hindexed block,
     * x3 = blkhindx block, x4 = element within the blkhindx block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
348 
/* Host-side launcher: runs the hindexed[resized[blkhindx[int16_t]]] unpack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_unpack_hindexed_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
356 
/* Pack kernel for a contig[resized[blkhindx[int16_t]]] datatype tree:
 * each thread gathers one int16_t from the strided source layout into the
 * contiguous pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_contig_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index: x0 = outer count, x1 = contig element,
     * x2 = blkhindx block, x3 = element within the blkhindx block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    /* contig elements are spaced by the child's extent */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source combines extent strides + displacement */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int16_t)));
}
386 
/* Host-side launcher: runs the contig[resized[blkhindx[int16_t]]] pack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_pack_contig_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
394 
/* Unpack kernel for a contig[resized[blkhindx[int16_t]]] datatype tree:
 * mirror of the pack kernel — each thread scatters one int16_t from the
 * contiguous source buffer into the strided destination layout. */
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index: x0 = outer count, x1 = contig element,
     * x2 = blkhindx block, x3 = element within the blkhindx block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    /* contig elements are spaced by the child's extent */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
424 
/* Host-side launcher: runs the contig[resized[blkhindx[int16_t]]] unpack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_unpack_contig_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
432 
/* Pack kernel for a resized[resized[blkhindx[int16_t]]] datatype tree: the
 * nested resized level only changes the extent, so the indexing matches
 * the single-resized case — one thread copies one int16_t into the
 * contiguous pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_resized_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per packed element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index: x0 = outer count, x1 = blkhindx block,
     * x2 = element within the block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.blkhindx.array_of_displs;
    /* dest is contiguous; source offset = outer extent stride + block
     * displacement + element offset within the block */
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int16_t)));
}
457 
/* Host-side launcher: runs the resized[resized[blkhindx[int16_t]]] pack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_pack_resized_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
465 
/* Unpack kernel for a resized[resized[blkhindx[int16_t]]] datatype tree:
 * mirror of the pack kernel — each thread scatters one int16_t from the
 * contiguous source buffer into the strided destination layout. */
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* flat 1D global thread index; one thread per element */
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the launch grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel the flat index: x0 = outer count, x1 = blkhindx block,
     * x2 = element within the block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.blkhindx.array_of_displs;
    /* source is contiguous; destination offset mirrors the pack kernel */
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
490 
/* Host-side launcher: runs the resized[resized[blkhindx[int16_t]]] unpack
 * kernel on the per-device yaksa stream. */
void yaksuri_cudai_unpack_resized_resized_blkhindx_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_blkhindx_int16_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
498 
499