/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

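/* Each kernel below maps one GPU thread to one int8_t element.  The pack
 * kernels gather elements from a non-contiguous layout into a contiguous
 * buffer; the unpack kernels scatter them back.  A thread decomposes its
 * flat index into per-level coordinates (x0, x1, ...) by walking the
 * datatype-metadata tree in md: it divides by the number of elements
 * contained at each level and, for hindexed levels, scans the blocklength
 * array to find the enclosing block.  The coordinates are then combined
 * with extents, strides, and displacements to form the byte offset of the
 * element in the non-contiguous buffer.  The host-side wrappers launch the
 * kernels asynchronously on the per-device stream in yaksuri_cudai_global. */
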
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

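/* hvector of resized hindexed elements: x0 selects the outer count, x1 the
 * hvector block (stride), x2 the element within that block (child extent),
 * x3 the hindexed block (displacement), and x4 the int8_t within it. */
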
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_hvector_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

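/* blkhindx of resized hindexed elements: x0 selects the outer count, x1 the
 * block (displacement), x2 the element within that block (child extent),
 * x3 the inner hindexed block, and x4 the int8_t within it. */
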
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_blkhindx_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

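/* hindexed of resized hindexed elements: x0 selects the outer count, x1 the
 * outer hindexed block, x2 the element within that block (child extent),
 * x3 the inner hindexed block, and x4 the int8_t within it. */
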
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_hindexed_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

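/* contig of resized hindexed elements: x0 selects the outer count, x1 the
 * contiguous element (child extent), x2 the hindexed block, and x3 the
 * int8_t within it. */
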
__global__ void yaksuri_cudai_kernel_pack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_contig_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

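/* resized of resized hindexed elements: x0 selects the outer count, x1 the
 * hindexed block, and x2 the int8_t within it. */
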
__global__ void yaksuri_cudai_kernel_pack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + idx * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int8_t)));
}

void yaksuri_cudai_pack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.hindexed.array_of_displs;
    *((int8_t *) (void *) (dbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(int8_t))) = *((const int8_t *) (const void *) (sbuf + idx * sizeof(int8_t)));
}

void yaksuri_cudai_unpack_resized_resized_hindexed_int8_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_hindexed_int8_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}