1 /*
2 * Copyright (C) by Argonne National Laboratory
3 * See COPYRIGHT in top-level directory
4 *
5 * DO NOT EDIT: AUTOMATICALLY GENERATED FILE !!
6 */
7
8 #include <string.h>
9 #include <stdint.h>
10 #include <wchar.h>
11 #include <assert.h>
12 #include <cuda.h>
13 #include <cuda_runtime.h>
14 #include "yaksuri_cudai_base.h"
15 #include "yaksuri_cudai_pup.h"
16
/* Pack one char element per thread from a blkhindx[hvector[char]] layout
 * (count outer copies described by md) into the contiguous outbuf.
 * The grid's x dimension must cover count * md->num_elements threads;
 * md and its displacement array must be device-accessible. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply: blockIdx.x * blockDim.x is a 32-bit
     * unsigned product and would wrap for very large grids */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat element index into per-level coordinates, outermost
     * first: x0 = outer count, x1 = blkhindx block, x2 = element within
     * the block, x3 = hvector block, x4 = element within that block. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * sizeof(char)));
}
50
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
58
/* Unpack one char element per thread from the contiguous inbuf back into
 * a blkhindx[hvector[char]] layout (count outer copies described by md).
 * Mirror image of the pack kernel: the strided address is the store side.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Decompose the flat index: x0 outer copy, x1 blkhindx block,
     * x2 element in block, x3 hvector block, x4 element in that block. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t stride2 = md->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + x3 * stride2 + x4 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
92
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
100
/* Pack one char element per thread from an hvector[blkhindx[hvector[char]]]
 * layout (count outer copies described by md) into the contiguous outbuf.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_hvector_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1/x2 outer hvector block and element, x3/x4 blkhindx
     * block and element, x5/x6 inner hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char)));
}
143
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_hvector_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hvector_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
151
/* Unpack one char element per thread from the contiguous inbuf back into an
 * hvector[blkhindx[hvector[char]]] layout (count outer copies described by
 * md).  The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hvector_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1/x2 outer hvector block and element, x3/x4 blkhindx
     * block and element, x5/x6 inner hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hvector.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t stride1 = md->u.hvector.stride;
    intptr_t *array_of_displs2 = md->u.hvector.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hvector.child->extent;
    intptr_t stride3 = md->u.hvector.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.hvector.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
194
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_hvector_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hvector_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
202
/* Pack one char element per thread from a blkhindx[blkhindx[hvector[char]]]
 * layout (count outer copies described by md) into the contiguous outbuf.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_blkhindx_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1/x2 outer blkhindx block and element, x3/x4 inner
     * blkhindx block and element, x5/x6 hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char)));
}
245
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_blkhindx_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_blkhindx_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
253
/* Unpack one char element per thread from the contiguous inbuf back into a
 * blkhindx[blkhindx[hvector[char]]] layout (count outer copies described by
 * md).  The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1/x2 outer blkhindx block and element, x3/x4 inner
     * blkhindx block and element, x5/x6 hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.blkhindx.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.blkhindx.array_of_displs;
    intptr_t *array_of_displs2 = md->u.blkhindx.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.blkhindx.child->extent;
    intptr_t stride3 = md->u.blkhindx.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.blkhindx.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
296
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_blkhindx_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_blkhindx_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
304
/* Pack one char element per thread from an hindexed[blkhindx[hvector[char]]]
 * layout (count outer copies described by md) into the contiguous outbuf.
 * The hindexed level has per-block lengths, so its coordinate is found by a
 * linear scan rather than division.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_hindexed_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0 = outer copy */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1 = hindexed block containing res; defensively initialized in case
     * the blocklengths do not cover res (should not happen for valid md) */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2 = element within the hindexed block, x3/x4 = blkhindx block and
     * element, x5/x6 = hvector block and element */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char)));
}
358
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_hindexed_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_hindexed_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
366
/* Unpack one char element per thread from the contiguous inbuf back into an
 * hindexed[blkhindx[hvector[char]]] layout (count outer copies described by
 * md).  The hindexed level has per-block lengths, so its coordinate is found
 * by a linear scan rather than division.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_hindexed_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0 = outer copy */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.count;

    /* x1 = hindexed block containing res; defensively initialized in case
     * the blocklengths do not cover res (should not happen for valid md) */
    uintptr_t x1 = 0;
    for (int i = 0; i < md->u.hindexed.count; i++) {
        uintptr_t in_elems = md->u.hindexed.array_of_blocklengths[i] *
                             md->u.hindexed.child->num_elements;
        if (res < in_elems) {
            x1 = i;
            res %= in_elems;
            inner_elements = md->u.hindexed.child->num_elements;
            break;
        } else {
            res -= in_elems;
        }
    }

    /* x2 = element within the hindexed block, x3/x4 = blkhindx block and
     * element, x5/x6 = hvector block and element */
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.blocklength;
    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x5 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.hindexed.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x6 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs1 = md->u.hindexed.array_of_displs;
    intptr_t *array_of_displs2 = md->u.hindexed.child->u.blkhindx.array_of_displs;
    uintptr_t extent2 = md->u.hindexed.child->extent;
    intptr_t stride3 = md->u.hindexed.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.hindexed.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs1[x1] + x2 * extent2 + array_of_displs2[x3] + x4 * extent3 + x5 * stride3 + x6 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
420
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_hindexed_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_hindexed_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
428
/* Pack one char element per thread from a contig[blkhindx[hvector[char]]]
 * layout (count outer copies described by md) into the contiguous outbuf.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_contig_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1 contig element, x2/x3 blkhindx block and element,
     * x4/x5 hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x5 = res;

    /* contig elements are spaced by the child's extent; displacements and
     * strides are in bytes */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3 + x5 * sizeof(char)));
}
467
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_contig_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_contig_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
475
/* Unpack one char element per thread from the contiguous inbuf back into a
 * contig[blkhindx[hvector[char]]] layout (count outer copies described by
 * md).  The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_unpack_contig_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* Peel the flat index into coordinates, outermost level first:
     * x0 outer copy, x1 contig element, x2/x3 blkhindx block and element,
     * x4/x5 hvector block and element. */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.count;

    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.blocklength;
    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x4 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.contig.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x5 = res;

    /* contig elements are spaced by the child's extent; displacements and
     * strides are in bytes */
    intptr_t stride1 = md->u.contig.child->extent;
    intptr_t *array_of_displs2 = md->u.contig.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.contig.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.contig.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + x1 * stride1 + array_of_displs2[x2] + x3 * extent3 + x4 * stride3 + x5 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
514
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_contig_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_contig_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
522
/* Pack one char element per thread from a resized[blkhindx[hvector[char]]]
 * layout (count outer copies described by md) into the contiguous outbuf.
 * The resized level only changes the outer extent; it adds no coordinate.
 * The grid's x dimension must cover count * md->num_elements threads. */
__global__ void yaksuri_cudai_kernel_pack_resized_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0 outer copy, x1/x2 blkhindx block and element, x3/x4 hvector block
     * and element */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + idx * sizeof(char))) = *((const char *) (const void *) (sbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3 + x4 * sizeof(char)));
}
556
/* Host-side launcher: forwards the arguments to the matching pack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_pack_resized_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_pack_resized_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
564
/* Unpack one char element per thread from the contiguous inbuf back into a
 * resized[blkhindx[hvector[char]]] layout (count outer copies described by
 * md).  The resized level only changes the outer extent; it adds no
 * coordinate.  The grid's x dimension must cover count * md->num_elements
 * threads. */
__global__ void yaksuri_cudai_kernel_unpack_resized_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, const yaksuri_cudai_md_s *__restrict__ md)
{
    const char *__restrict__ sbuf = (const char *) inbuf;
    char *__restrict__ dbuf = (char *) outbuf;
    uintptr_t extent = md->extent;
    /* cast before the multiply so large grids do not wrap at 32 bits */
    uintptr_t idx = (uintptr_t) blockIdx.x * blockDim.x + threadIdx.x;
    uintptr_t res = idx;
    uintptr_t inner_elements = md->num_elements;

    if (idx >= (count * inner_elements))
        return;

    /* x0 outer copy, x1/x2 blkhindx block and element, x3/x4 hvector block
     * and element */
    uintptr_t x0 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.count;

    uintptr_t x1 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.blocklength;
    uintptr_t x2 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hvector.count;

    uintptr_t x3 = res / inner_elements;
    res %= inner_elements;
    inner_elements /= md->u.resized.child->u.blkhindx.child->u.hvector.blocklength;
    uintptr_t x4 = res;

    /* displacements and strides are in bytes */
    intptr_t *array_of_displs2 = md->u.resized.child->u.blkhindx.array_of_displs;
    intptr_t stride3 = md->u.resized.child->u.blkhindx.child->u.hvector.stride;
    uintptr_t extent3 = md->u.resized.child->u.blkhindx.child->extent;
    *((char *) (void *) (dbuf + x0 * extent + array_of_displs2[x1] + x2 * extent3 + x3 * stride3 + x4 * sizeof(char))) = *((const char *) (const void *) (sbuf + idx * sizeof(char)));
}
598
/* Host-side launcher: forwards the arguments to the matching unpack kernel
 * via cudaLaunchKernel on yaksuri_cudai_global.stream[device] with an
 * (n_blocks_x, n_blocks_y, n_blocks_z) grid of n_threads-wide blocks. */
void yaksuri_cudai_unpack_resized_blkhindx_hvector_char(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)
{
    /* cudaLaunchKernel expects the addresses of the kernel arguments */
    void *kern_args[] = { &inbuf, &outbuf, &count, &md };
    dim3 grid_dims(n_blocks_x, n_blocks_y, n_blocks_z);
    dim3 block_dims(n_threads);
    cudaError_t err =
        cudaLaunchKernel((const void *) yaksuri_cudai_kernel_unpack_resized_blkhindx_hvector_char,
                         grid_dims, block_dims, kern_args, 0,
                         yaksuri_cudai_global.stream[device]);
    YAKSURI_CUDAI_CUDA_ERR_CHECK(err);
}
606
607