1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack one _Bool per thread from a blkhindx-of-resized source layout into a
 * contiguous pack buffer.  1-D grid; callers size the grid to cover
 * count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1 = blkhindx block, x2 = element within block. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2));
}
42
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
50
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * blkhindx-of-resized destination layout.  1-D grid; callers size the grid
 * to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1 = blkhindx block, x2 = element within block. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
76
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
84
/* Pack one _Bool per thread from an hvector-of-blkhindx-of-resized source
 * layout into a contiguous pack buffer.  1-D grid; callers size the grid
 * to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = hvector block and element,
     * x3/x4 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
119
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_hvector_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
127
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * hvector-of-blkhindx-of-resized destination layout.  1-D grid; callers
 * size the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = hvector block and element,
     * x3/x4 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
162
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_hvector_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
170
/* Pack one _Bool per thread from a blkhindx-of-blkhindx-of-resized source
 * layout into a contiguous pack buffer.  1-D grid; callers size the grid
 * to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = outer blkhindx block and element,
     * x3/x4 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
205
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_blkhindx_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
213
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * blkhindx-of-blkhindx-of-resized destination layout.  1-D grid; callers
 * size the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = outer blkhindx block and element,
     * x3/x4 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
248
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_blkhindx_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
256
/* Pack one _Bool per thread from an hindexed-of-blkhindx-of-resized source
 * layout into a contiguous pack buffer.  1-D grid; callers size the grid
 * to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have varying lengths, so x1 is found by a linear scan
     * over the per-block element counts.  Initialize to 0: the scan always
     * assigns (idx < count * num_elements was checked above), but an explicit
     * initializer avoids maybe-uninitialized warnings and undefined behavior
     * if metadata were ever inconsistent. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2/x3/x4: inner blkhindx coordinates within the chosen hindexed block. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3));
}
302
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_hindexed_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
310
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * hindexed-of-blkhindx-of-resized destination layout.  1-D grid; callers
 * size the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* hindexed blocks have varying lengths, so x1 is found by a linear scan
     * over the per-block element counts.  Initialize to 0: the scan always
     * assigns (idx < count * num_elements was checked above), but an explicit
     * initializer avoids maybe-uninitialized warnings and undefined behavior
     * if metadata were ever inconsistent. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2/x3/x4: inner blkhindx coordinates within the chosen hindexed block. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
356
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_hindexed_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
364
/* Pack one _Bool per thread from a contig-of-blkhindx-of-resized source
 * layout into a contiguous pack buffer.  1-D grid; callers size the grid
 * to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1 = contig element,
     * x2/x3 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    /* For contig, consecutive children are spaced by the child extent. */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3));
}
395
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_contig_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
403
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * contig-of-blkhindx-of-resized destination layout.  1-D grid; callers
 * size the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1 = contig element,
     * x2/x3 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    /* For contig, consecutive children are spaced by the child extent. */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
434
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_contig_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
442
/* Pack one _Bool per thread from a resized-of-blkhindx-of-resized source
 * layout into a contiguous pack buffer.  The resized level contributes no
 * coordinate of its own — only the adjusted extent.  1-D grid; callers size
 * the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3));
}
468
/* Host-side launcher: enqueue the pack kernel on the per-device stream. */
void yaksuri_cudai_pack_resized_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
476
/* Unpack one _Bool per thread from a contiguous pack buffer back into the
 * resized-of-blkhindx-of-resized destination layout.  The resized level
 * contributes no coordinate of its own — only the adjusted extent.  1-D
 * grid; callers size the grid to cover count * md->num_elements elements. */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit
     * (unsigned int) product and can overflow for very large grids. */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat element index into per-level coordinates:
     * x0 = outer count, x1/x2 = inner blkhindx block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res;

    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
502
/* Host-side launcher: enqueue the unpack kernel on the per-device stream. */
void yaksuri_cudai_unpack_resized_blkhindx_resized__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_resized__Bool,
                                        grid_dims, block_dims, kernel_args, 0,
                                        yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
510
511