/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

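/*
 * Pack/unpack kernels for int64_t elements laid out by nested
 * blkhindx/hindexed type trees.  Each kernel maps one thread to one
 * int64_t element: the linear element index (idx) is decomposed into
 * one coordinate per type level, and those coordinates are combined
 * with the per-level extents, strides, and displacement arrays to
 * locate the element inside the non-contiguous user buffer.
 */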
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

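    /* Peel one coordinate off the linear index per type level: x0 is the
     * outer count, x1/x2 index the blkhindx blocks and the elements within
     * a block, x3 is the hindexed block (found by scanning the per-block
     * element counts), and x4 is the element within that block. */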
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
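    /* Pack: gather the element from its strided source location and store
     * it contiguously at position idx in the packed buffer. */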
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
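    /* Host-side wrapper: launch the pack kernel asynchronously on the
     * stream associated with the target device; errors are reported via
     * YAKSURI_CUDAI_CUDA_ERR_CHECK. */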
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
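    /* Unpack: mirror of the pack kernel; read the element contiguously
     * from the packed buffer and scatter it to its strided destination. */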
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

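    /* The outer hvector level addresses its blocks with a constant stride;
     * the inner blkhindx and hindexed levels use displacement arrays. */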
    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_hvector_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hvector.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hvector_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_blkhindx_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.blkhindx.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_blkhindx_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_hindexed_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x5;
    for (int i = 0; i < md->u.hindexed.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x5 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_hindexed_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_contig_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x4;
    for (int i = 0; i < md->u.contig.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_contig_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(int64_t)));
}

void yaksuri_cudai_pack_resized_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}

__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hindexed.count;

    uintptr_t x3;
    for (int i = 0; i < md->u.resized.child->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}

void yaksuri_cudai_unpack_resized_blkhindx_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_hindexed_int64_t,
                                        dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}