/*
 * Copyright (C) by Argonne National Laboratory
 * See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for a contig[hindexed[int64_t]] nested datatype: each thread
 * copies one int64_t from the strided source layout described by `md` into
 * the contiguous destination buffer (gather).
 *
 * Launch expectation: 1-D thread indexing; the grid must supply at least
 * count * md->num_elements threads (surplus threads exit at the guard).
 * NOTE(review): assumes md describes a contig type whose child is hindexed
 * with an int64_t leaf -- the generated caller guarantees this.
 */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder: flat element index, peeled digit by digit below */
    uintptr_t inner_elements = md->num_elements;

    /* one thread per int64_t element; threads past the end drop out */
    if (idx >= (count * inner_elements))
        return;

    /* x0: index into the outermost `count` repetitions of the full type */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: linear search for the hindexed block that contains the remainder.
     * By the guard above res is smaller than the total element count of the
     * hindexed level, so one block always matches and x2 is always assigned
     * (generator invariant).  The `res %= in_elems` is a no-op here since
     * res < in_elems; it is emitted by the generator for uniformity. */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset inside the selected hindexed block */
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via extents + displacements */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(int64_t)));
}
57
/* Host-side launcher: enqueues the contig[hindexed[int64_t]] pack kernel on
 * the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
65
/* Unpack kernel for a contig[hindexed[int64_t]] nested datatype: each thread
 * copies one int64_t from the contiguous source buffer into the strided
 * destination layout described by `md` (scatter; mirror of the pack kernel).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset inside the selected block */
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via extents + displacements; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
106
/* Host-side launcher: enqueues the contig[hindexed[int64_t]] unpack kernel
 * on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
114
/* Pack kernel for an hvector[contig[hindexed[int64_t]]] nested datatype:
 * each thread gathers one int64_t from the strided source layout described
 * by `md` into the contiguous destination buffer.
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block index; x2: position within an hvector block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.hindexed.count;

    /* x4: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x4;
    for (int i = 0; i < md->u.hvector.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected block */
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via strides, extents and displacements */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t)));
}
164
/* Host-side launcher: enqueues the hvector[contig[hindexed[int64_t]]] pack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_hvector_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
172
/* Unpack kernel for an hvector[contig[hindexed[int64_t]]] nested datatype:
 * each thread scatters one int64_t from the contiguous source buffer into
 * the strided destination layout described by `md` (mirror of the pack).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector block index; x2: position within an hvector block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.hindexed.count;

    /* x4: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x4;
    for (int i = 0; i < md->u.hvector.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected block */
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via strides, extents and displacements; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
222
/* Host-side launcher: enqueues the hvector[contig[hindexed[int64_t]]] unpack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_hvector_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
230
/* Pack kernel for a blkhindx[contig[hindexed[int64_t]]] nested datatype:
 * each thread gathers one int64_t from the strided source layout described
 * by `md` into the contiguous destination buffer.
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: blkhindx block index; x2: position within a blkhindx block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.hindexed.count;

    /* x4: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x4;
    for (int i = 0; i < md->u.blkhindx.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected block */
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via displacements and extents */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t)));
}
280
/* Host-side launcher: enqueues the blkhindx[contig[hindexed[int64_t]]] pack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_blkhindx_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
288
/* Unpack kernel for a blkhindx[contig[hindexed[int64_t]]] nested datatype:
 * each thread scatters one int64_t from the contiguous source buffer into
 * the strided destination layout described by `md` (mirror of the pack).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    /* x1: blkhindx block index; x2: position within a blkhindx block */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.hindexed.count;

    /* x4: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x4;
    for (int i = 0; i < md->u.blkhindx.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected block */
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via displacements and extents; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
338
/* Host-side launcher: enqueues the blkhindx[contig[hindexed[int64_t]]]
 * unpack kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_blkhindx_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
346
/* Pack kernel for an hindexed[contig[hindexed[int64_t]]] nested datatype:
 * each thread gathers one int64_t from the strided source layout described
 * by `md` into the contiguous destination buffer.  Two displacement search
 * loops are needed, one per hindexed level.
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: find the outer hindexed block containing the remainder; always
     * found (res < total elements by the guard -- generator invariant) */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position within the selected outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.hindexed.count;

    /* x4: find the inner hindexed block containing the remainder */
    uintptr_t x4;
    for (int i = 0; i < md->u.hindexed.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected inner block */
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via displacements and extents */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t)));
}
407
/* Host-side launcher: enqueues the hindexed[contig[hindexed[int64_t]]] pack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_hindexed_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
415
/* Unpack kernel for an hindexed[contig[hindexed[int64_t]]] nested datatype:
 * each thread scatters one int64_t from the contiguous source buffer into
 * the strided destination layout described by `md` (mirror of the pack).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: find the outer hindexed block containing the remainder; always
     * found (res < total elements by the guard -- generator invariant) */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position within the selected outer block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    /* x3: index within the contig level */
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.hindexed.count;

    /* x4: find the inner hindexed block containing the remainder */
    uintptr_t x4;
    for (int i = 0; i < md->u.hindexed.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x4 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x5: element offset inside the selected inner block */
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via displacements and extents; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
476
/* Host-side launcher: enqueues the hindexed[contig[hindexed[int64_t]]]
 * unpack kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_hindexed_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
484
/* Pack kernel for a contig[contig[hindexed[int64_t]]] nested datatype:
 * each thread gathers one int64_t from the strided source layout described
 * by `md` into the contiguous destination buffer.
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_pack_contig_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the outer contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    /* x2: index within the inner contig level */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.hindexed.count;

    /* x3: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x3;
    for (int i = 0; i < md->u.contig.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset inside the selected block */
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via extents + displacements */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(int64_t)));
}
530
/* Host-side launcher: enqueues the contig[contig[hindexed[int64_t]]] pack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_contig_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
538
/* Unpack kernel for a contig[contig[hindexed[int64_t]]] nested datatype:
 * each thread scatters one int64_t from the contiguous source buffer into
 * the strided destination layout described by `md` (mirror of the pack).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_contig_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    /* x1: index within the outer contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    /* x2: index within the inner contig level */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.hindexed.count;

    /* x3: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x3;
    for (int i = 0; i < md->u.contig.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: element offset inside the selected block */
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via extents + displacements; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
584
/* Host-side launcher: enqueues the contig[contig[hindexed[int64_t]]] unpack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_contig_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
592
/* Pack kernel for a resized[contig[hindexed[int64_t]]] nested datatype:
 * each thread gathers one int64_t from the strided source layout described
 * by `md` into the contiguous destination buffer.  The resized level adds no
 * index digit of its own; it only changes the outer extent.
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_pack_resized_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index (uses the resized extent) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.hindexed.count;

    /* x2: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x2;
    for (int i = 0; i < md->u.resized.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset inside the selected block */
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.hindexed.array_of_displs;
    /* contiguous write at idx; strided read via extents + displacements */
    *((int64_t *) (void *) (dbuf + idx * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(int64_t)));
}
633
/* Host-side launcher: enqueues the resized[contig[hindexed[int64_t]]] pack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_pack_resized_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
641
/* Unpack kernel for a resized[contig[hindexed[int64_t]]] nested datatype:
 * each thread scatters one int64_t from the contiguous source buffer into
 * the strided destination layout described by `md` (mirror of the pack).
 *
 * Launch expectation: 1-D thread indexing with at least
 * count * md->num_elements threads; surplus threads exit at the guard.
 */
__global__ void yaksuri_cudai_kernel_unpack_resized_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;    /* remainder of the flat element index */
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0: outermost repetition index (uses the resized extent) */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    /* x1: index within the contig level */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.hindexed.count;

    /* x2: find the hindexed block containing the remainder; always found
     * (res < total hindexed elements by the guard -- generator invariant) */
    uintptr_t x2;
    for (int i = 0; i < md->u.resized.child->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x3: element offset inside the selected block */
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.hindexed.array_of_displs;
    /* strided write via extents + displacements; contiguous read at idx */
    *((int64_t *) (void *) (dbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(int64_t))) = *((const int64_t *) (const void *) (sbuf + idx * sizeof(int64_t)));
}
682
/* Host-side launcher: enqueues the resized[contig[hindexed[int64_t]]] unpack
 * kernel on the per-device yaksa stream and checks the launch status. */
void yaksuri_cudai_unpack_resized_contig_hindexed_int64_t(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_hindexed_int64_t,
                                        grid_dims, block_dims, kernel_args,
                                        0 /* no dynamic shared memory */,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
690
691