1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack kernel for a contig[blkhindx[float]] datatype tree: each thread copies
 * exactly one float from the strided source layout into the contiguous pack
 * buffer.  Expects the 3D grid/1D block launch produced by the host wrapper. */
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid: the host launcher passes a 3D block grid, so
     * blockIdx.y/z must contribute to the global element index.  Promoting to
     * uintptr_t first also avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard: the grid rarely divides the element count evenly. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(float)));
}
46
/* Host-side launcher: enqueues the contig_blkhindx pack kernel on the
 * per-device stream with the caller-supplied 3D grid and 1D block shape. */
void yaksuri_cudai_pack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
54
/* Unpack kernel for a contig[blkhindx[float]] datatype tree: each thread
 * copies one float from the contiguous pack buffer back into the strided
 * destination layout (mirror of the pack kernel). */
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
84
/* Host-side launcher: enqueues the contig_blkhindx unpack kernel on the
 * per-device stream with the caller-supplied 3D grid and 1D block shape. */
void yaksuri_cudai_unpack_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
92
/* Pack kernel for an hvector[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the strided source into the pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree
     * (x1 = hvector block, x2 = element within the hvector block). */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}
131
/* Host-side launcher: enqueues the hvector_contig_blkhindx pack kernel on the
 * per-device stream with the caller-supplied 3D grid and 1D block shape. */
void yaksuri_cudai_pack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
139
/* Unpack kernel for an hvector[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the pack buffer back to the strided layout. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t stride1 = md->u.hvector.stride;
    intptr_t stride2 = md->u.hvector.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t *array_of_displs3 = md->u.hvector.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
178
/* Host-side launcher: enqueues the hvector_contig_blkhindx unpack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_unpack_hvector_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
186
/* Pack kernel for a blkhindx[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the displaced source into the pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree
     * (x1 = outer blkhindx block, x2 = element within that block). */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}
225
/* Host-side launcher: enqueues the blkhindx_contig_blkhindx pack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_pack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
233
/* Unpack kernel for a blkhindx[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the pack buffer back to the displaced layout. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t *array_of_displs3 = md->u.blkhindx.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
272
/* Host-side launcher: enqueues the blkhindx_contig_blkhindx unpack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_unpack_blkhindx_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
280
/* Pack kernel for an hindexed[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the displaced source into the pack buffer.
 * The hindexed level has per-block lengths, so its coordinate is found by a
 * linear scan rather than a division. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* Scan the variable-length hindexed blocks for the one containing res.
     * Initialized to 0 so x1 is never read uninitialized even if the loop
     * were to fall through without a break. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* Peel one coordinate off per remaining nesting level. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float)));
}
330
/* Host-side launcher: enqueues the hindexed_contig_blkhindx pack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_pack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
338
/* Unpack kernel for an hindexed[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the pack buffer back to the displaced layout.
 * The hindexed level has per-block lengths, so its coordinate is found by a
 * linear scan rather than a division. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* Scan the variable-length hindexed blocks for the one containing res.
     * Initialized to 0 so x1 is never read uninitialized even if the loop
     * were to fall through without a break. */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* Peel one coordinate off per remaining nesting level. */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x5 = res;

    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t stride2 = md->u.hindexed.child->u.contig.child->extent;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t *array_of_displs3 = md->u.hindexed.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + array_of_displs3[x4] + x5 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
388
/* Host-side launcher: enqueues the hindexed_contig_blkhindx unpack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_unpack_hindexed_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
396
/* Pack kernel for a contig[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the strided source into the pack buffer. */
__global__ void yaksuri_cudai_kernel_pack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(float)));
}
431
/* Host-side launcher: enqueues the contig_contig_blkhindx pack kernel on the
 * per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_pack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
439
/* Unpack kernel for a contig[contig[blkhindx[float]]] datatype tree: each
 * thread copies one float from the pack buffer back to the strided layout. */
__global__ void yaksuri_cudai_kernel_unpack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level of the datatype tree. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x4 = res;

    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t stride2 = md->u.contig.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.contig.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * stride2 + array_of_displs3[x3] + x4 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
474
/* Host-side launcher: enqueues the contig_contig_blkhindx unpack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_unpack_contig_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
482
/* Pack kernel for a resized[contig[blkhindx[float]]] datatype tree: the
 * resized level only changes the outer extent, so it contributes no
 * coordinate of its own.  Each thread copies one float. */
__global__ void yaksuri_cudai_kernel_pack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level below the resized wrapper. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + idx * sizeof(float))) = *((const float *) (const void *) (sbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(float)));
}
512
/* Host-side launcher: enqueues the resized_contig_blkhindx pack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_pack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
520
/* Unpack kernel for a resized[contig[blkhindx[float]]] datatype tree: the
 * resized level only changes the outer extent, so it contributes no
 * coordinate of its own.  Each thread copies one float back. */
__global__ void yaksuri_cudai_kernel_unpack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* Flatten the full 3D grid (host launches a 3D block grid); uintptr_t
     * promotion avoids 32-bit overflow on very large grids. */
    uintptr_t idx = ((blockIdx.z * (uintptr_t) gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    /* Tail guard for grids that overshoot the element count. */
    if (idx >= (count * inner_elements))
        return;

    /* Peel one coordinate off per nesting level below the resized wrapper. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res;

    intptr_t stride2 = md->u.resized.child->u.contig.child->extent;
    intptr_t *array_of_displs3 = md->u.resized.child->u.contig.child->u.blkhindx.array_of_displs;
    *((float *) (void *) (dbuf + x0 * extent + x1 * stride2 + array_of_displs3[x2] + x3 * sizeof(float))) = *((const float *) (const void *) (sbuf + idx * sizeof(float)));
}
550
/* Host-side launcher: enqueues the resized_contig_blkhindx unpack kernel on
 * the per-device stream with the caller-supplied grid/block shape. */
void yaksuri_cudai_unpack_resized_contig_blkhindx_float(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    void *kernel_args[] = { &inbuf, &outbuf, &count, &md };
    cudaError_t cerr =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_contig_blkhindx_float,
                         grid_dims, block_dims, kernel_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);
}
558
559