1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for a hindexed(blkhindx(char)) datatype: gathers one char per
 * thread from the strided source buffer into a contiguous destination.
 * The flat thread index is decomposed level by level into coordinates
 * (x0 = top-level count, x1 = hindexed block, x2 = element within block,
 * x3 = blkhindx block, x4 = char within that block), and the source byte
 * offset is rebuilt from the extents and displacement arrays in `md`.
 * Assumes a 1-D launch sized to cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x1 by scanning
     * the per-block element counts instead of dividing */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(char)));
}
61
/* Host-side launcher for the hindexed(blkhindx(char)) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
69
/* Unpack kernel for a hindexed(blkhindx(char)) datatype: scatters one char
 * per thread from the contiguous source buffer into the strided destination.
 * Mirror image of the pack kernel: the same coordinate decomposition
 * (x0..x4) is used, but the computed offset addresses the destination. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x1 by scanning */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
114
/* Host-side launcher for the hindexed(blkhindx(char)) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
122
/* Pack kernel for an hvector(hindexed(blkhindx(char))) datatype: gathers one
 * char per thread into a contiguous destination.  Coordinates: x0 = top-level
 * count, x1 = hvector block (stride1 apart), x2 = element within the hvector
 * blocklength, x3 = hindexed block (found by scan), x4 = element within that
 * block, x5 = blkhindx block, x6 = char within it. */
__global__ void yaksuri_cudai_kernel_pack_hvector_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char)));
}
176
/* Host-side launcher for the hvector(hindexed(blkhindx(char))) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_hvector_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
184
/* Unpack kernel for an hvector(hindexed(blkhindx(char))) datatype: scatters
 * one char per thread from the contiguous source into the strided
 * destination.  Mirror image of the corresponding pack kernel; the same
 * coordinate decomposition (x0..x6) addresses the destination instead. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.hvector.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hvector.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hvector.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hvector.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hvector.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
238
/* Host-side launcher for the hvector(hindexed(blkhindx(char))) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_hvector_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
246
/* Pack kernel for a blkhindx(hindexed(blkhindx(char))) datatype: gathers one
 * char per thread into a contiguous destination.  Coordinates: x0 = top-level
 * count, x1 = outer blkhindx block (displs1), x2 = element within its
 * blocklength, x3 = hindexed block (found by scan), x4 = element within that
 * block, x5 = inner blkhindx block, x6 = char within it. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char)));
}
300
/* Host-side launcher for the blkhindx(hindexed(blkhindx(char))) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_blkhindx_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
308
/* Unpack kernel for a blkhindx(hindexed(blkhindx(char))) datatype: scatters
 * one char per thread from the contiguous source into the strided
 * destination.  Mirror image of the corresponding pack kernel; the same
 * coordinate decomposition (x0..x6) addresses the destination instead. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.blkhindx.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.blkhindx.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.blkhindx.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.blkhindx.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.blkhindx.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
362
/* Host-side launcher for the blkhindx(hindexed(blkhindx(char))) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_blkhindx_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
370
/* Pack kernel for a hindexed(hindexed(blkhindx(char))) datatype: gathers one
 * char per thread into a contiguous destination.  Coordinates: x0 = top-level
 * count, x1 = outer hindexed block (scan), x2 = element within it, x3 = inner
 * hindexed block (scan), x4 = element within it, x5 = blkhindx block,
 * x6 = char within it. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* outer hindexed blocks have non-uniform lengths: locate x1 by scanning */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* inner hindexed blocks likewise: locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char)));
}
435
/* Host-side launcher for the hindexed(hindexed(blkhindx(char))) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_hindexed_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
443
/* Unpack kernel for a hindexed(hindexed(blkhindx(char))) datatype: scatters
 * one char per thread from the contiguous source into the strided
 * destination.  Mirror image of the corresponding pack kernel; the same
 * coordinate decomposition (x0..x6) addresses the destination instead. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* outer hindexed blocks have non-uniform lengths: locate x1 by scanning */
    uintptr_t x1;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.count;

    /* inner hindexed blocks likewise: locate x3 by scanning */
    uintptr_t x3;
    for (int i = 0; i < md->u.hindexed.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x3 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x6 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.hindexed.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + array_of_displs3[x5] + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
508
/* Host-side launcher for the hindexed(hindexed(blkhindx(char))) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_hindexed_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
516
/* Pack kernel for a contig(hindexed(blkhindx(char))) datatype: gathers one
 * char per thread into a contiguous destination.  Coordinates: x0 = top-level
 * count, x1 = contig element (stride1 = child extent apart), x2 = hindexed
 * block (found by scan), x3 = element within it, x4 = blkhindx block,
 * x5 = char within it. */
__global__ void yaksuri_cudai_kernel_pack_contig_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x2 by scanning */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(char)));
}
566
/* Host-side launcher for the contig(hindexed(blkhindx(char))) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_contig_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
574
/* Unpack kernel for a contig(hindexed(blkhindx(char))) datatype: scatters one
 * char per thread from the contiguous source into the strided destination.
 * Mirror image of the corresponding pack kernel; the same coordinate
 * decomposition (x0..x5) addresses the destination instead. */
__global__ void yaksuri_cudai_kernel_unpack_contig_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x2 by scanning */
    uintptr_t x2;
    for (int i = 0; i < md->u.contig.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.contig.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.contig.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x2 = i;
            res %= in_elems;
            inner_elements = md->u.contig.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.contig.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + array_of_displs3[x4] + x5 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
624
/* Host-side launcher for the contig(hindexed(blkhindx(char))) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_contig_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
632
/* Pack kernel for a resized(hindexed(blkhindx(char))) datatype: gathers one
 * char per thread into a contiguous destination.  The resized level only
 * changes the extent, so it contributes no coordinate of its own:
 * x0 = top-level count, x1 = hindexed block (found by scan), x2 = element
 * within it, x3 = blkhindx block, x4 = char within it. */
__global__ void yaksuri_cudai_kernel_pack_resized_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x1 by scanning */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* gather: strided source element -> contiguous destination slot */
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(char)));
}
677
/* Host-side launcher for the resized(hindexed(blkhindx(char))) pack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_pack_resized_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
685
/* Unpack kernel for a resized(hindexed(blkhindx(char))) datatype: scatters
 * one char per thread from the contiguous source into the strided
 * destination.  Mirror image of the corresponding pack kernel; the same
 * coordinate decomposition (x0..x4) addresses the destination instead. */
__global__ void yaksuri_cudai_kernel_unpack_resized_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* tail guard: the grid may be rounded up past the element count */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.count;

    /* hindexed blocks have non-uniform lengths, so locate x1 by scanning */
    uintptr_t x1;
    for (int i = 0; i < md->u.resized.child->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.resized.child->u.hindexed.array_of_blocklengths[i] *
                             md->u.resized.child->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.resized.child->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.hindexed.array_of_displs;
    intptr_t *array_of_displs3 = md->u.resized.child->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.hindexed.child->extent;
    /* scatter: contiguous source slot -> strided destination element */
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + array_of_displs3[x3] + x4 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
730
/* Host-side launcher for the resized(hindexed(blkhindx(char))) unpack kernel.
 * Arguments are passed by address, as cudaLaunchKernel requires, and the
 * kernel is enqueued on the per-device stream held in yaksuri_cudai_global. */
void yaksuri_cudai_unpack_resized_hindexed_blkhindx_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_hindexed_blkhindx_char,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
738
739