1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
__global__ void yaksuri_cudai_kernel_pack_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel: each thread copies exactly one _Bool from its strided
     * location in inbuf (resized:contig metadata in md) to the contiguous
     * position matching its flat element index in outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems_per_count = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (elem_id >= count * elems_per_count)
        return;

    /* split the flat index into (outer count, inner element) coordinates */
    uintptr_t x0 = elem_id / elems_per_count;
    uintptr_t x1 = elem_id % elems_per_count;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    const char *src_elem = src + x0 * md->extent + x1 * stride2;
    *((_Bool *) (void *) (dst + elem_id * sizeof(_Bool))) =
        *((const _Bool *) (const void *) src_elem);
}
38
void yaksuri_cudai_pack_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the pack kernel with the requested grid/block
     * configuration on the per-device stream and check the launch status. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
46
__global__ void yaksuri_cudai_kernel_unpack_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel: each thread copies exactly one _Bool from its
     * contiguous position in inbuf to the strided destination described by
     * the resized:contig metadata in md. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems_per_count = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (elem_id >= count * elems_per_count)
        return;

    /* split the flat index into (outer count, inner element) coordinates */
    uintptr_t x0 = elem_id / elems_per_count;
    uintptr_t x1 = elem_id % elems_per_count;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    char *dst_elem = dst + x0 * md->extent + x1 * stride2;
    *((_Bool *) (void *) dst_elem) =
        *((const _Bool *) (const void *) (src + elem_id * sizeof(_Bool)));
}
68
void yaksuri_cudai_unpack_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the unpack kernel with the requested grid/block
     * configuration on the per-device stream and check the launch status. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
76
__global__ void yaksuri_cudai_kernel_pack_hvector_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel: one _Bool per thread.  The flat element index is peeled
     * apart, level by level, into coordinates (x0..x3) of the
     * hvector:resized:contig metadata tree, which give the strided source
     * offset; the destination is contiguous at the flat index. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.count;

    uintptr_t x1 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.blocklength;

    uintptr_t x2 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.child->u.resized.child->u.contig.count;

    uintptr_t x3 = rem;

    const char *src_elem = src
        + x0 * md->extent
        + x1 * md->u.hvector.stride
        + x2 * md->u.hvector.child->extent
        + x3 * md->u.hvector.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) (dst + elem_id * sizeof(_Bool))) =
        *((const _Bool *) (const void *) src_elem);
}
107
void yaksuri_cudai_pack_hvector_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the hvector pack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
115
__global__ void yaksuri_cudai_kernel_unpack_hvector_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel: one _Bool per thread.  The flat element index is peeled
     * apart, level by level, into coordinates (x0..x3) of the
     * hvector:resized:contig metadata tree, which give the strided
     * destination offset; the source is contiguous at the flat index. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.count;

    uintptr_t x1 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.blocklength;

    uintptr_t x2 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.hvector.child->u.resized.child->u.contig.count;

    uintptr_t x3 = rem;

    char *dst_elem = dst
        + x0 * md->extent
        + x1 * md->u.hvector.stride
        + x2 * md->u.hvector.child->extent
        + x3 * md->u.hvector.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) dst_elem) =
        *((const _Bool *) (const void *) (src + elem_id * sizeof(_Bool)));
}
146
void yaksuri_cudai_unpack_hvector_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the hvector unpack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
154
__global__ void yaksuri_cudai_kernel_pack_blkhindx_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel: one _Bool per thread.  The flat element index is peeled
     * apart into coordinates (x0..x3) of the blkhindx:resized:contig
     * metadata tree; the x1 level uses a per-block displacement table
     * instead of a uniform stride.  Destination is contiguous. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.count;

    uintptr_t x1 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.blocklength;

    uintptr_t x2 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.child->u.resized.child->u.contig.count;

    uintptr_t x3 = rem;

    const char *src_elem = src
        + x0 * md->extent
        + md->u.blkhindx.array_of_displs[x1]
        + x2 * md->u.blkhindx.child->extent
        + x3 * md->u.blkhindx.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) (dst + elem_id * sizeof(_Bool))) =
        *((const _Bool *) (const void *) src_elem);
}
185
void yaksuri_cudai_pack_blkhindx_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the blkhindx pack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
193
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel: one _Bool per thread.  The flat element index is peeled
     * apart into coordinates (x0..x3) of the blkhindx:resized:contig
     * metadata tree; the x1 level uses a per-block displacement table
     * instead of a uniform stride.  Source is contiguous. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.count;

    uintptr_t x1 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.blocklength;

    uintptr_t x2 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.blkhindx.child->u.resized.child->u.contig.count;

    uintptr_t x3 = rem;

    char *dst_elem = dst
        + x0 * md->extent
        + md->u.blkhindx.array_of_displs[x1]
        + x2 * md->u.blkhindx.child->extent
        + x3 * md->u.blkhindx.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) dst_elem) =
        *((const _Bool *) (const void *) (src + elem_id * sizeof(_Bool)));
}
224
void yaksuri_cudai_unpack_blkhindx_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the blkhindx unpack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
232
__global__ void yaksuri_cudai_kernel_pack_hindexed_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel for hindexed:resized:contig layouts of _Bool.  One thread
     * copies one element: the flat element index is decomposed into the
     * per-level coordinates (x0..x3), and the element is read from its
     * strided source location and written contiguously at the flat index. */
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* Scan the hindexed blocks (variable blocklengths) to find which block
     * contains this element.  x1 is initialized so it is never read
     * uninitialized below even if the metadata were inconsistent and the
     * loop fell through without a match (the original left x1 indeterminate
     * in that case, which is undefined behavior at array_of_displs1[x1]). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) (dbuf + idx * sizeof(_Bool))) = *((const _Bool *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3));
}
274
void yaksuri_cudai_pack_hindexed_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the hindexed pack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
282
__global__ void yaksuri_cudai_kernel_unpack_hindexed_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel for hindexed:resized:contig layouts of _Bool.  One
     * thread copies one element: the flat element index is decomposed into
     * the per-level coordinates (x0..x3), and the element is read
     * contiguously at the flat index and written to its strided
     * destination location. */
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    uintptr_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* Scan the hindexed blocks (variable blocklengths) to find which block
     * contains this element.  x1 is initialized so it is never read
     * uninitialized below even if the metadata were inconsistent and the
     * loop fell through without a match (the original left x1 indeterminate
     * in that case, which is undefined behavior at array_of_displs1[x1]). */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
            md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.resized.child->u.contig.count;

    uintptr_t x3 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride3)) = *((const _Bool *) (const void *) (sbuf + idx * sizeof(_Bool)));
}
324
void yaksuri_cudai_unpack_hindexed_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the hindexed unpack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
332
__global__ void yaksuri_cudai_kernel_pack_contig_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel: one _Bool per thread.  The flat element index is peeled
     * apart into coordinates (x0..x2) of the contig:resized:contig metadata
     * tree, giving the strided source offset; destination is contiguous. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.contig.count;

    uintptr_t x1 = rem / chunk;
    uintptr_t x2 = rem % chunk;

    const char *src_elem = src
        + x0 * md->extent
        + x1 * md->u.contig.child->extent
        + x2 * md->u.contig.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) (dst + elem_id * sizeof(_Bool))) =
        *((const _Bool *) (const void *) src_elem);
}
359
void yaksuri_cudai_pack_contig_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the contig pack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
367
__global__ void yaksuri_cudai_kernel_unpack_contig_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel: one _Bool per thread.  The flat element index is
     * peeled apart into coordinates (x0..x2) of the contig:resized:contig
     * metadata tree, giving the strided destination offset; source is
     * contiguous. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t chunk = md->num_elements;   /* elements under the current level */

    if (elem_id >= count * chunk)
        return;

    uintptr_t rem = elem_id;
    uintptr_t x0 = rem / chunk;
    rem %= chunk;
    chunk /= md->u.contig.count;

    uintptr_t x1 = rem / chunk;
    uintptr_t x2 = rem % chunk;

    char *dst_elem = dst
        + x0 * md->extent
        + x1 * md->u.contig.child->extent
        + x2 * md->u.contig.child->u.resized.child->u.contig.child->extent;
    *((_Bool *) (void *) dst_elem) =
        *((const _Bool *) (const void *) (src + elem_id * sizeof(_Bool)));
}
394
void yaksuri_cudai_unpack_contig_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the contig unpack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
402
__global__ void yaksuri_cudai_kernel_pack_resized_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Pack kernel: each thread copies exactly one _Bool from its strided
     * location in inbuf (resized:resized:contig metadata in md) to the
     * contiguous position matching its flat element index in outbuf. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems_per_count = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (elem_id >= count * elems_per_count)
        return;

    /* split the flat index into (outer count, inner element) coordinates */
    uintptr_t x0 = elem_id / elems_per_count;
    uintptr_t x1 = elem_id % elems_per_count;

    intptr_t stride3 = md->u.resized.child->u.resized.child->u.contig.child->extent;
    const char *src_elem = src + x0 * md->extent + x1 * stride3;
    *((_Bool *) (void *) (dst + elem_id * sizeof(_Bool))) =
        *((const _Bool *) (const void *) src_elem);
}
424
void yaksuri_cudai_pack_resized_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the resized pack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
432
__global__ void yaksuri_cudai_kernel_unpack_resized_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    /* Unpack kernel: each thread copies exactly one _Bool from its
     * contiguous position in inbuf to the strided destination described by
     * the resized:resized:contig metadata in md. */
    const char *__restrict__ src = (const char *) inbuf;
    char *__restrict__ dst = (char *) outbuf;

    uintptr_t elem_id = blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t elems_per_count = md->num_elements;

    /* threads past the total number of elements do nothing */
    if (elem_id >= count * elems_per_count)
        return;

    /* split the flat index into (outer count, inner element) coordinates */
    uintptr_t x0 = elem_id / elems_per_count;
    uintptr_t x1 = elem_id % elems_per_count;

    intptr_t stride3 = md->u.resized.child->u.resized.child->u.contig.child->extent;
    char *dst_elem = dst + x0 * md->extent + x1 * stride3;
    *((_Bool *) (void *) dst_elem) =
        *((const _Bool *) (const void *) (src + elem_id * sizeof(_Bool)));
}
454
void yaksuri_cudai_unpack_resized_resized_contig__Bool(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* Host wrapper: launch the resized unpack kernel with the requested
     * grid/block configuration on the per-device stream. */
    dim3 grid(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block(n_threads);
    void *kargs[] = { &inbuf, &outbuf, &count, &md };

    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_resized_contig__Bool,
                         grid, block, kargs, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
462
463