/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

/* Pack kernel for an hindexed-of-resized int16_t layout: each thread copies
 * exactly one int16_t element from the strided source described by md into
 * the contiguous output buffer at offset idx.  The flat thread index is
 * decomposed into coordinates: x0 = outer count index, x1 = hindexed block
 * index (found by linear scan over block lengths), x2 = element within the
 * block.  Assumes a 1D grid; excess tail threads exit at the bounds check. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outermost (count) dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the hindexed block containing the residual element index;
     * blocks may have different lengths, so scan cumulatively */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}
53
/* Host-side launcher for the hindexed_resized int16_t pack kernel.
 * Launches with the caller-supplied grid/block configuration on the
 * per-device stream; the launch status is checked via the module's
 * error macro. */
void yaksuri_cudai_pack_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
61
/* Unpack kernel: inverse of the pack kernel above.  Each thread reads one
 * int16_t from the contiguous source at offset idx and writes it into the
 * hindexed-of-resized destination layout described by md, using the same
 * idx -> (x0, x1, x2) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* peel off the outermost (count) dimension */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
98
/* Host-side launcher for the hindexed_resized int16_t unpack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
106
/* Pack kernel for an hvector-of-hindexed-of-resized int16_t layout.  Each
 * thread copies one int16_t into the contiguous output at offset idx.  The
 * flat index is decomposed as: x0 = outer count, x1 = hvector block,
 * x2 = element within the hvector blocklength, x3 = hindexed block (linear
 * scan), x4 = element within that block. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
152
/* Host-side launcher for the hvector_hindexed_resized int16_t pack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_pack_hvector_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
160
/* Unpack kernel: inverse of the hvector pack kernel above.  Each thread
 * reads one int16_t from the contiguous source at offset idx and scatters
 * it into the hvector-of-hindexed-of-resized destination layout, using the
 * same idx -> (x0..x4) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
206
/* Host-side launcher for the hvector_hindexed_resized int16_t unpack
 * kernel.  Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_hvector_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
214
/* Pack kernel for a blkhindx-of-hindexed-of-resized int16_t layout.  Each
 * thread copies one int16_t into the contiguous output at offset idx.  The
 * flat index is decomposed as: x0 = outer count, x1 = blkhindx block (fixed
 * blocklength, per-block displacement), x2 = element within that block,
 * x3 = inner hindexed block (linear scan), x4 = element within it. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
260
/* Host-side launcher for the blkhindx_hindexed_resized int16_t pack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_pack_blkhindx_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
268
/* Unpack kernel: inverse of the blkhindx pack kernel above.  Each thread
 * reads one int16_t from the contiguous source at offset idx and scatters
 * it into the blkhindx-of-hindexed-of-resized destination layout, using the
 * same idx -> (x0..x4) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
314
/* Host-side launcher for the blkhindx_hindexed_resized int16_t unpack
 * kernel.  Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_blkhindx_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
322
/* Pack kernel for an hindexed-of-hindexed-of-resized int16_t layout.  Each
 * thread copies one int16_t into the contiguous output at offset idx.  The
 * flat index is decomposed as: x0 = outer count, x1 = outer hindexed block
 * (linear scan), x2 = child instance within that block, x3 = inner hindexed
 * block (linear scan), x4 = element within it. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the outer hindexed block containing the residual index */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* locate the inner hindexed block containing the residual index */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
379
/* Host-side launcher for the hindexed_hindexed_resized int16_t pack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_pack_hindexed_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
387
/* Unpack kernel: inverse of the hindexed_hindexed pack kernel above.  Each
 * thread reads one int16_t from the contiguous source at offset idx and
 * scatters it into the nested hindexed destination layout, using the same
 * idx -> (x0..x4) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the outer hindexed block containing the residual index */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* locate the inner hindexed block containing the residual index */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
444
/* Host-side launcher for the hindexed_hindexed_resized int16_t unpack
 * kernel.  Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_hindexed_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
452
/* Pack kernel for a contig-of-hindexed-of-resized int16_t layout.  Each
 * thread copies one int16_t into the contiguous output at offset idx.  The
 * flat index is decomposed as: x0 = outer count, x1 = contig repetition
 * (stride = child extent), x2 = hindexed block (linear scan), x3 = element
 * within that block. */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3));
}
494
/* Host-side launcher for the contig_hindexed_resized int16_t pack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_pack_contig_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
502
/* Unpack kernel: inverse of the contig pack kernel above.  Each thread
 * reads one int16_t from the contiguous source at offset idx and scatters
 * it into the contig-of-hindexed-of-resized destination layout, using the
 * same idx -> (x0..x3) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
544
/* Host-side launcher for the contig_hindexed_resized int16_t unpack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_contig_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
552
/* Pack kernel for a resized-of-hindexed-of-resized int16_t layout.  The
 * resized wrapper contributes only the outer extent; each thread copies one
 * int16_t into the contiguous output at offset idx.  The flat index is
 * decomposed as: x0 = outer count, x1 = hindexed block (linear scan),
 * x2 = element within that block. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + idx * sizeof(int16_t))) = *((const int16_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3));
}
589
/* Host-side launcher for the resized_hindexed_resized int16_t pack kernel.
 * Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_pack_resized_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
597
/* Unpack kernel: inverse of the resized pack kernel above.  Each thread
 * reads one int16_t from the contiguous source at offset idx and scatters
 * it into the resized-of-hindexed-of-resized destination layout, using the
 * same idx -> (x0, x1, x2) coordinate decomposition. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid size may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* locate the hindexed block containing the residual element index */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int16_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3)) = *((const int16_t *) (const void *) (sbuf + idx * sizeof(int16_t)));
}
634
/* Host-side launcher for the resized_hindexed_resized int16_t unpack
 * kernel.  Launches on the per-device stream and checks the launch status. */
void yaksuri_cudai_unpack_resized_hindexed_resized_int16_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_resized_int16_t,
        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
642
643