/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

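/*
 * Editor's note (not generated): the kernels below all follow the same
 * pattern and handle one int64_t element per thread.  The flat thread index
 * `idx` is peeled apart, level by level, into per-level coordinates
 * (x0, x1, ...) using the element counts recorded in the type metadata `md`;
 * hindexed levels need a short scan over array_of_blocklengths because their
 * blocks are not uniform.  The coordinates are then combined with extents,
 * strides, and displacements to form the byte offset of the element in the
 * non-contiguous buffer: pack copies it to the contiguous buffer at
 * idx * sizeof(int64_t), unpack copies in the opposite direction.
 */
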
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

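/*
 * Illustrative host-side sketch (not part of the generated code): the kernels
 * guard with `idx >= count * md->num_elements`, so any launch geometry that
 * covers at least count * num_elements threads is sufficient.  With a
 * hypothetical `num_elements` value known on the host, a 1-D launch could
 * look like:
 *
 *     int n_threads = 256;
 *     uintptr_t total = count * num_elements;
 *     int n_blocks_x = (int) ((total + n_threads - 1) / n_threads);
 *     yaksuri_cudai_pack_hindexed_hindexed_int64_t(inbuf, outbuf, count, md,
 *                                                  n_threads, n_blocks_x,
 *                                                  1, 1, device);
 *
 * The actual thread/block counts and metadata handling are decided by the
 * calling yaksa code, not in this file.
 */
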
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_hvector_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hvector_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_blkhindx_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_blkhindx_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_hindexed_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hindexed_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_contig_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_contig_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_resized_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_resized_hindexed_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}