1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for a hindexed-of-contig float layout.
 *
 * One thread copies exactly one float: it decodes its flat element index
 * into the per-level coordinates (x0..x3) of the source layout described
 * by "md", then gathers that element into the packed destination buffer.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 here -- confirm with the
 * generator that emits this file.
 */
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard: the grid may be larger than the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* x0: which top-level replica of the datatype */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: which hindexed block; blocks have unequal lengths, so scan them.
     * res is already < elements-per-replica, so the break always fires and
     * x1 is always assigned before use despite looking uninitialized. */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position inside the chosen block; x3: element of the contig child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* gather one float from the strided source into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2));
}
58
/* Host-side launcher: packs "count" hindexed-of-contig float elements by
 * dispatching the matching __global__ kernel on the per-device stream. */
void yaksuri_cudai_pack_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
66
/* Unpack kernel for a hindexed-of-contig float layout: the mirror of the
 * pack kernel. Each thread reads one float from the packed source buffer
 * and scatters it to its strided position (x0..x3) in the destination.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard: the grid may overshoot the total element count */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica index */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: hindexed block index; res < elements-per-replica guarantees the
     * break fires, so x1 is always assigned before use. */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position inside the block; x3: element of the contig child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* scatter one float from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
108
/* Host-side launcher: unpacks "count" hindexed-of-contig float elements by
 * dispatching the matching __global__ kernel on the per-device stream. */
void yaksuri_cudai_unpack_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
116
/* Pack kernel for an hvector-of-hindexed-of-contig float layout.
 * Each thread decodes its flat index into coordinates x0..x5 (replica,
 * hvector stride index, blocklength index, hindexed block, block element,
 * contig element) and gathers one float into the packed buffer.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector count index; x2: index within the hvector blocklength */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: hindexed block index; the break always fires because res is
     * already reduced below the per-child element count. */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
            md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: position inside the block; x5: element of the contig child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* gather into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
167
/* Host-side launcher: packs "count" hvector-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_pack_hvector_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
175
/* Unpack kernel for an hvector-of-hindexed-of-contig float layout: mirror
 * of the pack kernel; reads the packed source and scatters to the strided
 * destination addressed by coordinates x0..x5.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    /* x0: top-level replica */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    /* x1: hvector count index; x2: index within the hvector blocklength */
    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* x3: hindexed block index; break always fires, x3 always assigned */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
            md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x4: position inside the block; x5: element of the contig child */
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* scatter from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
226
/* Host-side launcher: unpacks "count" hvector-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_unpack_hvector_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
234
/* Pack kernel for a blkhindx-of-hindexed-of-contig float layout.
 * Coordinates: x0 replica, x1 blkhindx displacement index, x2 index within
 * the (uniform) blocklength, x3 inner hindexed block, x4 element within
 * that block, x5 contig element. One float gathered per thread.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: inner hindexed block; break always fires, x3 always assigned */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
            md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* gather into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
285
/* Host-side launcher: packs "count" blkhindx-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_pack_blkhindx_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
293
/* Unpack kernel for a blkhindx-of-hindexed-of-contig float layout: mirror
 * of the pack kernel; scatters packed floats to coordinates x0..x5.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* x3: inner hindexed block; break always fires, x3 always assigned */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
            md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* scatter from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
344
/* Host-side launcher: unpacks "count" blkhindx-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_unpack_blkhindx_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
352
/* Pack kernel for a hindexed-of-hindexed-of-contig float layout.
 * Two block-scan loops decode the two hindexed levels (x1, x3); x0 is the
 * replica, x2/x4 index within each block, x5 is the contig element.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outer hindexed block; break always fires, x1 always assigned */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: inner hindexed block; same always-assigned reasoning as x1 */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* gather into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3));
}
414
/* Host-side launcher: packs "count" hindexed-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_pack_hindexed_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
422
/* Unpack kernel for a hindexed-of-hindexed-of-contig float layout: mirror
 * of the pack kernel; scatters packed floats to coordinates x0..x5.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1: outer hindexed block; break always fires, x1 always assigned */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* x3: inner hindexed block; same always-assigned reasoning as x1 */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.contig.count;

    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* scatter from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
484
/* Host-side launcher: unpacks "count" hindexed-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_unpack_hindexed_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
492
/* Pack kernel for a contig-of-hindexed-of-contig float layout.
 * Coordinates: x0 replica, x1 contig index (stride1 = child extent),
 * x2 hindexed block, x3 element within the block, x4 contig element.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: hindexed block; break always fires, x2 always assigned */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
            md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* gather into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3));
}
539
/* Host-side launcher: packs "count" contig-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_pack_contig_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
547
/* Unpack kernel for a contig-of-hindexed-of-contig float layout: mirror of
 * the pack kernel; scatters packed floats to coordinates x0..x4.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* x2: hindexed block; break always fires, x2 always assigned */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
            md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.contig.count;

    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* scatter from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
594
/* Host-side launcher: unpacks "count" contig-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_unpack_contig_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
602
/* Pack kernel for a resized-of-hindexed-of-contig float layout.
 * The resized level contributes only the outer extent (md->extent); the
 * remaining coordinates address the child hindexed/contig levels.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: hindexed block; break always fires, x1 always assigned */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
            md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position inside the block; x3: element of the contig child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* gather into the packed destination */
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3));
}
644
/* Host-side launcher: packs "count" resized-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_pack_resized_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
652
/* Unpack kernel for a resized-of-hindexed-of-contig float layout: mirror
 * of the pack kernel; scatters packed floats to coordinates x0..x3.
 *
 * NOTE(review): only blockIdx.x is read although the launcher passes a 3D
 * grid; presumably n_blocks_y == n_blocks_z == 1 -- confirm. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;                            /* remainder still to decode */
    uintptr_t inner_elements = md->num_elements;    /* floats per top-level element */

    /* tail guard */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* x1: hindexed block; break always fires, x1 always assigned */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
            md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2: position inside the block; x3: element of the contig child */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* scatter from the packed source into the strided destination */
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3)) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
694
/* Host-side launcher: unpacks "count" resized-of-hindexed-of-contig float
 * elements by dispatching the matching kernel on the per-device stream. */
void yaksuri_cudai_unpack_resized_hindexed_contig_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_contig_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
702
703