/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 *
 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
 */

#include <string.h>
#include <stdint.h>
#include <wchar.h>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "yaksuri_cudai_base.h"
#include "yaksuri_cudai_pup.h"

/* Pack kernel for a resized(blkhindx(_Bool)) type tree: each thread copies
 * exactly one _Bool element from the strided source layout into the
 * contiguous pack buffer.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count) index, x1 = blkhindx block, x2 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(_Bool)));
}
41
/* Host-side launcher: enqueue the resized(blkhindx) _Bool pack kernel on the
 * per-device stream with the caller-chosen grid/block dimensions. */
void yaksuri_cudai_pack_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
49
/* Unpack kernel for a resized(blkhindx(_Bool)) type tree: each thread copies
 * exactly one _Bool element from the contiguous pack buffer back into the
 * strided destination layout.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count) index, x1 = blkhindx block, x2 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
74
/* Host-side launcher: enqueue the resized(blkhindx) _Bool unpack kernel on
 * the per-device stream with the caller-chosen grid/block dimensions. */
void yaksuri_cudai_unpack_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
82
/* Pack kernel for hvector(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the strided source layout into the contiguous pack
 * buffer.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count), x1 = hvector block, x2 = element in hvector block,
     * x3 = inner blkhindx block, x4 = element in inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool)));
}
116
/* Host-side launcher: enqueue the hvector(resized(blkhindx)) _Bool pack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_pack_hvector_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
124
/* Unpack kernel for hvector(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the contiguous pack buffer back into the strided
 * destination layout.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count), x1 = hvector block, x2 = element in hvector block,
     * x3 = inner blkhindx block, x4 = element in inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
158
/* Host-side launcher: enqueue the hvector(resized(blkhindx)) _Bool unpack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_unpack_hvector_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
166
/* Pack kernel for blkhindx(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the strided source layout into the contiguous pack
 * buffer.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count), x1 = outer blkhindx block, x2 = element in it,
     * x3 = inner blkhindx block, x4 = element in inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool)));
}
200
/* Host-side launcher: enqueue the blkhindx(resized(blkhindx)) _Bool pack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_pack_blkhindx_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
208
/* Unpack kernel for blkhindx(resized(blkhindx(_Bool))): each thread copies
 * one _Bool element from the contiguous pack buffer back into the strided
 * destination layout.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level:
     * x0 = outer (count), x1 = outer blkhindx block, x2 = element in it,
     * x3 = inner blkhindx block, x4 = element in inner block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
242
/* Host-side launcher: enqueue the blkhindx(resized(blkhindx)) _Bool unpack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_unpack_blkhindx_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
250
/* Pack kernel for hindexed(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the strided source layout into the contiguous pack
 * buffer.  hindexed blocks have per-block lengths, so the block index is
 * found by a linear scan rather than division. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the hindexed block containing this element; the bounds check
     * above guarantees the loop breaks, the initializer just silences
     * maybe-uninitialized warnings */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* remaining levels: x2 = element in hindexed block, x3 = inner blkhindx
     * block, x4 = element in inner block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool)));
}
295
/* Host-side launcher: enqueue the hindexed(resized(blkhindx)) _Bool pack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_pack_hindexed_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
303
/* Unpack kernel for hindexed(resized(blkhindx(_Bool))): each thread copies
 * one _Bool element from the contiguous pack buffer back into the strided
 * destination layout.  hindexed blocks have per-block lengths, so the block
 * index is found by a linear scan rather than division. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* locate the hindexed block containing this element; the bounds check
     * above guarantees the loop breaks, the initializer just silences
     * maybe-uninitialized warnings */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* remaining levels: x2 = element in hindexed block, x3 = inner blkhindx
     * block, x4 = element in inner block */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs3[x3] + x4 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
348
/* Host-side launcher: enqueue the hindexed(resized(blkhindx)) _Bool unpack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_unpack_hindexed_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
356
/* Pack kernel for contig(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the strided source layout into the contiguous pack
 * buffer.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_pack_contig_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level: x0 = outer (count),
     * x1 = contig child, x2 = blkhindx block, x3 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(_Bool)));
}
386
/* Host-side launcher: enqueue the contig(resized(blkhindx)) _Bool pack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_pack_contig_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
394
/* Unpack kernel for contig(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the contiguous pack buffer back into the strided
 * destination layout.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level: x0 = outer (count),
     * x1 = contig child, x2 = blkhindx block, x3 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs3[x2] + x3 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
424
/* Host-side launcher: enqueue the contig(resized(blkhindx)) _Bool unpack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_unpack_contig_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
432
/* Pack kernel for resized(resized(blkhindx(_Bool))): each thread copies one
 * _Bool element from the strided source layout into the contiguous pack
 * buffer.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_pack_resized_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level: x0 = outer (count) index,
     * x1 = blkhindx block, x2 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(_Bool)));
}
457
/* Host-side launcher: enqueue the resized(resized(blkhindx)) _Bool pack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_pack_resized_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
465
/* Unpack kernel for resized(resized(blkhindx(_Bool))): each thread copies
 * one _Bool element from the contiguous pack buffer back into the strided
 * destination layout.  Expects a 1D thread mapping via blockIdx.x. */
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* widen before multiplying: blockIdx.x * blockDim.x is 32-bit unsigned
     * arithmetic and can wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* grid may be larger than the element count; excess threads exit */
    if (idx >= (count * inner_elements))
        return;

    /* decompose the flat index level by level: x0 = outer (count) index,
     * x1 = blkhindx block, x2 = element in block */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs3 = md->u.resized.child->u.resized.child->u.blkhindx.array_of_displs;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs3[x1] + x2 * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
490
/* Host-side launcher: enqueue the resized(resized(blkhindx)) _Bool unpack
 * kernel on the per-device stream with the caller-chosen dimensions. */
void yaksuri_cudai_unpack_resized_resized_blkhindx__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_blkhindx__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
498
499